First we import the libraries and data needed for the ESC-10 project. Next we create two structures that will be useful for keeping track of filenames, manipulating data, and playing audio.
!pip install tensorflow-model-optimization
!pip uninstall numba
!pip install -U numba
# restart runtime
import pandas as pd
import librosa
import numpy as np
import IPython.display as ipd
import pathlib
import glob, os, zipfile
import time
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from scipy.io import wavfile
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from keras import backend as K
from keras import layers
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, LSTM, Dense, Reshape, Dropout, Flatten, TimeDistributed
import tensorflow.keras.regularizers as rg
import seaborn as sns
from keras.utils.vis_utils import plot_model
from matplotlib.gridspec import SubplotSpec
import string
from tensorflow.keras.utils import to_categorical
import torch
import pickle
import tensorflow_model_optimization as tfmot
import tempfile
from sklearn.metrics import classification_report
! git clone https://github.com/karolpiczak/ESC-50.git
Cloning into 'ESC-50'... remote: Enumerating objects: 4193, done. remote: Counting objects: 100% (57/57), done. remote: Compressing objects: 100% (45/45), done. remote: Total 4193 (delta 36), reused 28 (delta 12), pack-reused 4136 Receiving objects: 100% (4193/4193), 878.79 MiB | 15.34 MiB/s, done. Resolving deltas: 100% (283/283), done. Updating files: 100% (2011/2011), done.
from google.colab import drive
drive.mount("/content/gdrive/")
%cd gdrive/MyDrive
Mounted at /content/gdrive/ /content/gdrive/MyDrive
ecg_50_csv = pd.read_csv("/content/ESC-50/meta/esc50.csv")
print(ecg_50_csv.shape)
ecg_10_csv = ecg_50_csv[ecg_50_csv['esc10']==True]
print(ecg_10_csv.shape)
(2000, 7) (400, 7)
ecg_10_csv.head()
| filename | fold | target | category | esc10 | src_file | take | |
|---|---|---|---|---|---|---|---|
| 0 | 1-100032-A-0.wav | 1 | 0 | dog | True | 100032 | A |
| 14 | 1-110389-A-0.wav | 1 | 0 | dog | True | 110389 | A |
| 24 | 1-116765-A-41.wav | 1 | 41 | chainsaw | True | 116765 | A |
| 54 | 1-17150-A-12.wav | 1 | 12 | crackling_fire | True | 17150 | A |
| 55 | 1-172649-A-40.wav | 1 | 40 | helicopter | True | 172649 | A |
# Collect the ESC-10 filenames from the filtered metadata, then load each
# matching WAV from the ESC-50 audio folder with librosa (which resamples
# to its default sample rate — presumably 22050 Hz; confirm if it matters).
filename_list = []
for i in ecg_10_csv['filename']:
    filename_list.append(i)
audio_arrays = []  # one waveform array per loaded clip
sounds = []        # full path of each loaded file, in sorted order
path = "/content/ESC-50/audio"
for filename in sorted(glob.glob(os.path.join(path, "*.wav"))):
    if pathlib.PurePath(filename).name in filename_list:
        y, sample_rate = librosa.load(filename)
        audio_arrays.append(y)
        sounds.append(filename)
In this section we start with an exploratory analysis, including data integrity checks such as verifying the balance of the classes, and then look at some plots that give a first idea of how to visualize audio. The preprocessing part that follows aims to create a final data structure that can be fed into the models. The first step is data augmentation: it increases the 400 original observations through transformations that, in our case, yield a dataset of 3600 observations. Before proceeding to the modeling section, it is important to normalize the data and split it into y and X. In particular, we will have two different X matrices depending on the approach used for classification: the first X consists of features obtained from the Mel-frequency cepstral coefficients, while the second consists of the pixels of the spectrogram images.
categories = list(set(ecg_10_csv['category']))
print(categories)
['helicopter', 'chainsaw', 'crackling_fire', 'sneezing', 'rain', 'rooster', 'clock_tick', 'sea_waves', 'crying_baby', 'dog']
unique, counts = np.unique(ecg_10_csv['category'], return_counts = True)
y_freq = counts
print(unique, counts)
sns.barplot(x=y_freq, y=categories, palette="icefire")
plt.savefig('/content/gdrive/MyDrive/ESC-10 material/balance_data.png')
['chainsaw' 'clock_tick' 'crackling_fire' 'crying_baby' 'dog' 'helicopter' 'rain' 'rooster' 'sea_waves' 'sneezing'] [40 40 40 40 40 40 40 40 40 40]
Now we want to explore the main properties of the sounds, so we generate three different plots:
y, sr = librosa.load(sounds[12])
S, phase = librosa.magphase(librosa.stft(y))
rms = librosa.feature.rms(S=S)
fig, ax = plt.subplots(figsize=(18, 14), nrows=3, sharex=False)
times = librosa.times_like(rms)
ax[0].plot(y, color = "purple")
ax[0].set(title='Raw Sound')
librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max), x_axis='time', ax=ax[1])
ax[1].set(title='Spectrogram')
chromagram = librosa.feature.chroma_stft(y = audio_arrays[13], sr = sr,hop_length = 512, win_length = 1024, n_chroma = 60)
librosa.display.specshow(chromagram, y_axis='chroma', x_axis='time')
ax[2].set(title='Chromatogram')
plt.savefig('/content/gdrive/MyDrive/ESC-10 material/EDA_first.png')
We create some transformation functions and use them, even combined with each other, to augment the data.
def noise_addiction(data, p):
    """Return copies of every clip with white Gaussian noise added.

    A single noise vector, sized to the first clip, is scaled by ``p`` and
    added to every clip (assumes all clips have equal length — TODO confirm).
    (Name kept as-is for compatibility; "noise addition" is meant.)
    """
    wn = np.random.randn(len(data[0]))
    return [clip + p * wn for clip in data]
def time_shifting(data, q):
    """Circularly shift every clip by ``q`` samples (wrap-around roll)."""
    return [np.roll(clip, q) for clip in data]
def time_stretching(data, s):
    """Time-stretch every clip in ``data`` by factor ``s`` (s > 1 speeds up).

    Bug fix: the original indexed the global ``audio_arrays`` instead of the
    ``data`` argument, so chained augmentations (e.g. stretching already
    time-shifted clips in ``data_augmentation``) silently operated on the
    raw, untransformed audio.
    """
    new_data = []
    for i in range(len(data)):
        new_audio = librosa.effects.time_stretch(data[i], rate=s)
        new_data.append(new_audio)
    return new_data
def pitch_shifting(data, r):
    """Pitch-shift every clip in ``data`` down by 5 semitones.

    Bug fix: the original indexed the global ``audio_arrays`` instead of the
    ``data`` argument, so composed augmentations (pitch-shifting stretched
    clips) ignored their input.

    NOTE(review): ``r`` is passed to librosa as the sample rate, and callers
    pass r=5 — almost certainly ``sample_rate`` was intended; confirm before
    changing, since the trained models were fitted on the current output.
    """
    new_data = []
    for i in range(len(data)):
        new_audio = librosa.effects.pitch_shift(data[i], sr=r, n_steps=-5)
        new_data.append(new_audio)
    return new_data
def data_augmentation(data):
    """Expand the dataset 9x: the originals plus 8 augmented variants.

    Variants are noise addition, time shift, time stretch, pitch shift, and
    four pairwise combinations of those transforms. Ordering is preserved
    per variant: with N originals, indices [0, N) are the originals,
    [N, 2N) the noisy copies, and so on — downstream label replication
    relies on this layout.

    Fix: removed a dead statement that min-max normalized the unrelated
    global ``y`` and discarded the result.
    """
    noise_add = noise_addiction(data, 0.05)
    time_shift = time_shifting(data, 50000)
    time_stretch = time_stretching(data, 2)
    pitch_shift = pitch_shifting(data, 5)
    noise_time_shift = noise_addiction(time_shift, 0.05)
    time_stretch_pitch_shift = pitch_shifting(time_stretch, 5)
    time_stretch_shift = time_stretching(time_shift, 2)
    noise_pitch_shift = noise_addiction(pitch_shift, 0.05)
    new_data = data + noise_add + time_shift + time_stretch + pitch_shift + noise_time_shift + time_stretch_pitch_shift + time_stretch_shift + noise_pitch_shift
    return new_data
new_data = data_augmentation(audio_arrays)
len(new_data[0])
110250
Here a sample audio is displayed through sound waves and played in its original form and after being transformed in the 8 different ways.
fig, axs = plt.subplots(3, 3)
axs[0, 0].plot(np.linspace(0, 1, len(new_data[47])), new_data[47], color='xkcd:muted purple')
axs[0, 0].set_title("Original audio", fontsize=10)
axs[0, 1].plot(np.linspace(0, 1, len(new_data[447])), new_data[447], color='xkcd:muted purple')
axs[0, 1].set_title("Noise addiction", fontsize=10)
axs[0, 2].plot(np.linspace(0, 1, len(new_data[847])), new_data[847], color='xkcd:muted purple')
axs[0, 2].set_title("Time shifting", fontsize=10)
axs[1, 0].plot(np.linspace(0, 1, len(new_data[1247])), new_data[1247], color='xkcd:muted purple')
axs[1, 0].set_title("Time stretching", fontsize=10)
axs[1, 1].plot(np.linspace(0, 1, len(new_data[1647])), new_data[1647], color='xkcd:muted purple')
axs[1, 1].set_title("Pitch shifting", fontsize=10)
axs[1, 2].plot(np.linspace(0, 1, len(new_data[2047])), new_data[2047], color='xkcd:muted purple')
axs[1, 2].set_title("Noise and time shifting", fontsize=10)
axs[2, 0].plot(np.linspace(0, 1, len(new_data[2447])), new_data[2447], color='xkcd:muted purple')
axs[2, 0].set_title("Time stretch. and pitch shift.", fontsize=10)
axs[2, 1].plot(np.linspace(0, 1, len(new_data[2847])), new_data[2847], color='xkcd:muted purple')
axs[2, 1].set_title("Time stretch. and shift.", fontsize=10)
axs[2, 2].plot(np.linspace(0, 1, len(new_data[3247])), new_data[3247], color='xkcd:muted purple')
axs[2, 2].set_title("Noise and pitch shift.", fontsize=10)
fig.tight_layout()
plt.savefig('/content/gdrive/MyDrive/ESC-10 material/augmented_data.png')
# original audio
ipd.Audio(new_data[47], rate=sample_rate)
# noise addiction
ipd.Audio(new_data[447], rate=sample_rate)
# time shifting
ipd.Audio(new_data[847], rate=sample_rate)
# time stretching
ipd.Audio(new_data[1247], rate=sample_rate)
# pitch shifting
ipd.Audio(new_data[1647], rate=sample_rate)
# noise addiction and time shifting
ipd.Audio(new_data[2047], rate=sample_rate)
# time stretching and pitch shifting
ipd.Audio(new_data[2447], rate=sample_rate)
# time shifting and time stretching
ipd.Audio(new_data[2847], rate=sample_rate)
# noise addiction and pitch shifting
ipd.Audio(new_data[3247], rate=sample_rate)
def normalize_data(data):
    """Min-max scale each clip independently to the [0, 1] range.

    Assumes every clip has at least two distinct values (constant clips
    would divide by zero — TODO confirm upstream guarantees this).
    """
    return [(clip - clip.min()) / (clip.max() - clip.min()) for clip in data]
new_data = normalize_data(new_data)
#file_path0 = '/content/gdrive/MyDrive/ESC-10 material/audio_data.npy'
#np.save(file_path0, new_data)
# Each augmented variant preserves the original clip order, so replicate
# the 400 category labels 9 times (originals + 8 transform sets) -> 3600.
ecg_10 = ecg_10_csv["category"]
labels = []
for j in range (1,10):
    for i in ecg_10:
        labels.append(i)
# Integer-encode the category names, then one-hot them for the softmax head.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)
y_cat = tf.keras.utils.to_categorical(y)
print(y.shape, y_cat.shape)
# to keep track of original labels
mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
print(mapping)
(3600,) (3600, 10)
{'chainsaw': 0, 'clock_tick': 1, 'crackling_fire': 2, 'crying_baby': 3, 'dog': 4, 'helicopter': 5, 'rain': 6, 'rooster': 7, 'sea_waves': 8, 'sneezing': 9}
This plot shows, for one example of each class, the frequency content of the audio signal through mel-spectrograms and MFCCs, and the amplitude changes through the chromagram.
df = pd.DataFrame({'X' : new_data, 'y' : y})
first_obs = df.groupby('y').first()
fig, ax = plt.subplots(3, 10, figsize=(40,10))
grid = plt.GridSpec(3, 10)
def create_subtitle(fig: plt.Figure, grid: SubplotSpec, title: str):
    """Place ``title`` as a bold heading over the subplot region ``grid``.

    Works by adding an invisible axes spanning the grid slice and using its
    title as the heading.
    """
    header = fig.add_subplot(grid)
    header.set_title(f'{title}\n', fontweight='semibold')
    header.set_frame_on(False)
    header.axis('off')
for k in range(first_obs.shape[0]):
create_subtitle(fig, grid[::, k], string.capwords(categories[k]))
for i in range(first_obs.shape[0]):
audio_array = np.stack(first_obs.iloc[i].to_numpy())[0]
librosa.display.specshow(librosa.power_to_db(librosa.feature.melspectrogram(y=audio_array, sr = sample_rate, n_mels = 128)), ax=ax[0,i], cmap = "magma")
ax[0,i].set_xlabel("Time $(s)$")
ax[0,i].set_ylabel("Frequency $(Hz)$")
ax[0,i].set_title("Mel-spectrogram")
librosa.display.specshow(librosa.feature.chroma_stft(y=audio_array, sr=sample_rate, hop_length=512, win_length=1024, n_chroma=60), cmap="magma", ax=ax[1,i])
ax[1,i].set_xlabel("Time $(s)$")
ax[1,i].set_ylabel("Pitch Classes")
ax[1,i].set_title("Chromagram")
librosa.display.specshow(librosa.feature.mfcc(y=audio_array, sr=sample_rate, n_mfcc=60, hop_length=512), cmap="magma", ax=ax[2,i])
ax[2,i].set_xlabel("Time (s)")
ax[2,i].set_ylabel("MFCC coefficient")
ax[2,i].set_title("MFCC")
fig.savefig('/content/gdrive/MyDrive/ESC-10 material/full_figure.png')
/usr/local/lib/python3.10/dist-packages/librosa/core/pitch.py:102: UserWarning: Trying to estimate tuning from empty frequency set. return pitch_tuning(
The first X is composed of the MFCCs, which consist of compact numerical descriptions of the spectral content of an audio signal. After trying a few configurations, the best representation of the ESC-10 data is as follows.
def mfccs_function(data):
    """Extract a (60, T, 3) feature cube per clip: MFCCs, chromagram, deltas.

    Relies on the module-level ``sample_rate`` captured while loading the
    audio. The three 60-row feature maps are stacked as channels so the
    result can be fed to a 2-D CNN like an image.
    """
    extracted_features = []
    for i in sorted(range(len(data))):
        melspectrogram = librosa.feature.melspectrogram(y = data[i], sr = sample_rate)
        logamplitude = librosa.amplitude_to_db(melspectrogram)
        # hop_length is the default spacing between frames
        # n_mfcc is the number of features
        mfccs = librosa.feature.mfcc(S = logamplitude, n_mfcc = 60, hop_length = 512, win_length = 1024)
        new_chromagram = librosa.feature.chroma_stft(y = data[i], sr = sample_rate, hop_length = 512, win_length = 1024, n_chroma = 60)
        new_delta = librosa.feature.delta(mfccs)
        # stack the three 60-row maps depth-wise -> shape (60, T, 3)
        new_instance = np.dstack((mfccs, new_chromagram, new_delta))
        extracted_features.append(new_instance)
    return extracted_features
X = mfccs_function(new_data)
print(X[0].shape)
(60, 216, 3)
After the augmentation, the sped-up audios are shorter; this function resizes them so that every audio has the same 5-second length.
def fix_function(data):
    """Pad half-length (time-stretched) feature cubes back to full length.

    Cubes of shape (60, 108, 3) are tiled once along the time axis so every
    observation ends up (60, 216, 3); all others pass through unchanged.
    Returns a single stacked ndarray.
    """
    fixed = []
    for feat in data:
        if feat.shape == (60, 108, 3):
            fixed.append(np.concatenate((feat, feat), axis=1))
        else:
            fixed.append(feat)
    return np.array(fixed)
X = fix_function(X)
"""
from scipy.io import wavfile
directory = 'ESC_10_sounds'
if not os.path.exists(directory):
os.makedirs(directory)
for i, audio_array in enumerate(new_data):
filename = os.path.join(directory, f'{i}.wav')
wavfile.write(filename, sample_rate, audio_array)
"""
"""
def create_spectrogram(audio_file, image_file):
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
fig.subplots_adjust(left=0, right=1, bottom=0, top=1)
y, sr = librosa.load(audio_file)
S, phase = librosa.magphase(librosa.stft(y))
librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max), x_axis='time')
fig.savefig(image_file)
plt.close(fig)
def create_pngs_from_wavs(input_path, output_path):
if not os.path.exists(output_path):
os.makedirs(output_path)
dir = os.listdir(input_path)
for i, file in enumerate(dir):
input_file = os.path.join(input_path, file)
output_file = os.path.join(output_path, file.replace('.wav', '.png'))
create_spectrogram(input_file, output_file)
create_pngs_from_wavs('/content/ESC_10_sounds', '/content/Spectrograms')
"""
"""
from google.colab import files
dir_to_zip = 'Spectrograms' #@param {type: "string"}
output_filename = 'spectrogram.zip' #@param {type: "string"}
delete_dir_after_download = "No" #@param ['Yes', 'No']
os.system( "zip -r {} {}".format( output_filename , dir_to_zip ) )
if delete_dir_after_download == "Yes":
os.system( "rm -r {}".format( dir_to_zip ) )
files.download( output_filename )
"""
import shutil
shutil.copy("/content/gdrive/MyDrive/spectrogram.zip", "/content")
'/content/spectrogram.zip'
!unzip /content/spectrogram.zip -d /content;
Archive: /content/spectrogram.zip replace /content/Spectrograms/1173.png? [y]es, [n]o, [A]ll, [N]one, [r]ename:
from keras.preprocessing import image
import keras.utils as image
def load_images_from_path(path):
    """Load every image in ``path`` as a (224, 224, 3) float array.

    Files are ordered numerically by their filename stem (e.g. ``10.png``
    after ``9.png``), matching the index order of the augmented dataset.
    """
    ordered = sorted(os.listdir(path), key=lambda f: int(f.split('.')[0]))
    return [
        image.img_to_array(
            image.load_img(os.path.join(path, f), target_size=(224, 224, 3))
        )
        for f in ordered
    ]
x1 = []
images = load_images_from_path('/content/Spectrograms')
x1 += images
X_spec = np.array(x1) / 255
import resource
# Time of inference
def measure_inference_time(model, data):
    """Return wall-clock seconds taken by one ``model.predict(data)`` call."""
    started = time.time()
    model.predict(data)
    return time.time() - started
# Size of gzipped model, in bytes
def get_gzipped_model_size(file):
    """Return the size in bytes of ``file`` after DEFLATE (zip) compression.

    Note: the temporary zip file is left on disk (same as before).
    """
    _, archive_path = tempfile.mkstemp('.zip')
    with zipfile.ZipFile(archive_path, 'w', compression=zipfile.ZIP_DEFLATED) as archive:
        archive.write(file)
    return os.path.getsize(archive_path)
session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
K.set_session(sess)
from keras.initializers import glorot_uniform
seed_value = 64
np.random.seed(seed_value)
tf.random.set_seed(seed_value)
val_size = .15
test_size = .15
X_train, X_test_val, y_cat_train, y_cat_test_val = train_test_split(X, y_cat, test_size=val_size+test_size, random_state=42, stratify=y_cat)
X_val, X_test, y_cat_val, y_cat_test = train_test_split(X_test_val, y_cat_test_val, test_size=test_size/(test_size+val_size), random_state=42, stratify=y_cat_test_val)
print(X_train.shape, X_test.shape, X_val.shape, y_cat_train.shape, y_cat_test.shape, y_cat_val.shape)
(2520, 60, 216, 3) (540, 60, 216, 3) (540, 60, 216, 3) (2520, 10) (540, 10) (540, 10)
file_path1 = '/content/gdrive/MyDrive/ESC-10 material/X_test_cnn_mfcc.npy'
file_path2 = '/content/gdrive/MyDrive/ESC-10 material/y_test_cnn_mfcc.npy'
file_path3 = '/content/gdrive/MyDrive/ESC-10 material/X_train_cnn_mfcc.npy'
file_path4 = '/content/gdrive/MyDrive/ESC-10 material/y_train_cnn_mfcc.npy'
file_path5 = '/content/gdrive/MyDrive/ESC-10 material/X_val_cnn_mfcc.npy'
file_path6 = '/content/gdrive/MyDrive/ESC-10 material/y_val_cnn_mfcc.npy'
np.save(file_path1, X_test)
np.save(file_path2, y_cat_test)
np.save(file_path3, X_train)
np.save(file_path4, y_cat_train)
np.save(file_path5, X_val)
np.save(file_path6, y_cat_val)
# For the RNN: keep only the first feature channel (the MFCCs, dropping
# chroma and delta) and transpose to (samples, time, features), the layout
# LSTM layers expect.
X_train_rnn = np.transpose(np.squeeze(X_train[..., :1]), (0, 2, 1))
X_val_rnn = np.transpose(np.squeeze(X_val[..., :1]), (0, 2, 1))
X_test_rnn = np.transpose(np.squeeze(X_test[..., :1]), (0, 2, 1))
print(X_train_rnn.shape, X_val_rnn.shape, X_test_rnn.shape)
(2520, 216, 60) (540, 216, 60) (540, 216, 60)
# Recover integer class ids from the one-hot targets (used for reports
# and confusion matrices).
y_train, y_val, y_test = (
    np.argmax(onehot, axis=1)
    for onehot in (y_cat_train, y_cat_val, y_cat_test)
)
# Stop training once val_loss has not improved for 4 consecutive epochs,
# and roll the weights back to the best epoch seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=4,
    mode='min',
    restore_best_weights=True
)
# CNN on the (60, 216, 3) MFCC/chromagram/delta feature "images".
model1 = Sequential([
    # strided 3x3 conv halves both spatial dimensions up front
    layers.Conv2D(32, (3,3), activation = 'relu', padding='valid', input_shape = X_train.shape[1:], strides=(2, 2), kernel_initializer=glorot_uniform(seed=seed_value)),
    layers.MaxPooling2D(2, padding='valid'),
    layers.Conv2D(64, (3,3), activation='relu', padding='valid'),
    layers.MaxPooling2D(2, padding='valid'),
    layers.Dropout(0.2),
    # L1 regularization on the deepest conv to curb overfitting
    layers.Conv2D(128, (2,2), activation='relu', padding='valid', kernel_regularizer=rg.l1(0.01)),
    layers.MaxPooling2D(2, padding='valid'),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.2),
    # 10-way softmax: one unit per ESC-10 class
    layers.Dense(10, activation = 'softmax')
])
model1.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = 'acc')
model1.summary()
Model: "sequential_3"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d_9 (Conv2D) (None, 29, 107, 32) 896
max_pooling2d_9 (MaxPooling (None, 14, 53, 32) 0
2D)
conv2d_10 (Conv2D) (None, 12, 51, 64) 18496
max_pooling2d_10 (MaxPoolin (None, 6, 25, 64) 0
g2D)
dropout_6 (Dropout) (None, 6, 25, 64) 0
conv2d_11 (Conv2D) (None, 5, 24, 128) 32896
max_pooling2d_11 (MaxPoolin (None, 2, 12, 128) 0
g2D)
flatten_3 (Flatten) (None, 3072) 0
dense_6 (Dense) (None, 128) 393344
dropout_7 (Dropout) (None, 128) 0
dense_7 (Dense) (None, 10) 1290
=================================================================
Total params: 446,922
Trainable params: 446,922
Non-trainable params: 0
_________________________________________________________________
# Train model
history1 = model1.fit(X_train, y_cat_train, batch_size=64, epochs=100, validation_data=(X_val, y_cat_val), callbacks=[early_stopping], shuffle=False)
# Evaluate model
score1 = model1.evaluate(X_val, y_cat_val, verbose=0)
# Save model
model1.save('/content/gdrive/MyDrive/models/cnn_mfcc.h5')
_, keras_file1 = tempfile.mkstemp('.h5')
tf.keras.models.save_model(model1, keras_file1, include_optimizer=False)
print('Saved baseline model to:', keras_file1)
Epoch 1/100 40/40 [==============================] - 12s 22ms/step - loss: 14.0587 - acc: 0.2917 - val_loss: 9.7520 - val_acc: 0.5407 Epoch 2/100 40/40 [==============================] - 0s 10ms/step - loss: 7.3951 - acc: 0.5492 - val_loss: 4.9981 - val_acc: 0.7185 Epoch 3/100 40/40 [==============================] - 0s 10ms/step - loss: 3.9801 - acc: 0.6671 - val_loss: 2.9242 - val_acc: 0.7500 Epoch 4/100 40/40 [==============================] - 0s 10ms/step - loss: 2.4962 - acc: 0.7056 - val_loss: 1.8449 - val_acc: 0.8278 Epoch 5/100 40/40 [==============================] - 0s 10ms/step - loss: 1.7145 - acc: 0.7440 - val_loss: 1.3222 - val_acc: 0.8315 Epoch 6/100 40/40 [==============================] - 0s 10ms/step - loss: 1.3155 - acc: 0.7639 - val_loss: 1.0423 - val_acc: 0.8278 Epoch 7/100 40/40 [==============================] - 0s 10ms/step - loss: 1.0436 - acc: 0.7940 - val_loss: 0.8423 - val_acc: 0.8537 Epoch 8/100 40/40 [==============================] - 0s 10ms/step - loss: 0.8997 - acc: 0.8044 - val_loss: 0.7917 - val_acc: 0.8370 Epoch 9/100 40/40 [==============================] - 0s 10ms/step - loss: 0.8411 - acc: 0.8258 - val_loss: 0.6861 - val_acc: 0.8759 Epoch 10/100 40/40 [==============================] - 0s 10ms/step - loss: 0.7382 - acc: 0.8377 - val_loss: 0.6627 - val_acc: 0.8667 Epoch 11/100 40/40 [==============================] - 0s 10ms/step - loss: 0.6941 - acc: 0.8409 - val_loss: 0.5489 - val_acc: 0.9111 Epoch 12/100 40/40 [==============================] - 0s 10ms/step - loss: 0.6324 - acc: 0.8615 - val_loss: 0.5185 - val_acc: 0.9000 Epoch 13/100 40/40 [==============================] - 0s 10ms/step - loss: 0.5858 - acc: 0.8726 - val_loss: 0.5507 - val_acc: 0.8889 Epoch 14/100 40/40 [==============================] - 0s 10ms/step - loss: 0.5415 - acc: 0.8821 - val_loss: 0.4793 - val_acc: 0.9074 Epoch 15/100 40/40 [==============================] - 0s 10ms/step - loss: 0.5083 - acc: 0.8901 - val_loss: 0.4542 - val_acc: 0.9167 Epoch 
16/100 40/40 [==============================] - 0s 10ms/step - loss: 0.4665 - acc: 0.9044 - val_loss: 0.4327 - val_acc: 0.9204 Epoch 17/100 40/40 [==============================] - 0s 9ms/step - loss: 0.4642 - acc: 0.9099 - val_loss: 0.4381 - val_acc: 0.9167 Epoch 18/100 40/40 [==============================] - 0s 11ms/step - loss: 0.4604 - acc: 0.9103 - val_loss: 0.4221 - val_acc: 0.9185 Epoch 19/100 40/40 [==============================] - 0s 10ms/step - loss: 0.4269 - acc: 0.9163 - val_loss: 0.3966 - val_acc: 0.9333 Epoch 20/100 40/40 [==============================] - 0s 11ms/step - loss: 0.3922 - acc: 0.9298 - val_loss: 0.3602 - val_acc: 0.9426 Epoch 21/100 40/40 [==============================] - 0s 10ms/step - loss: 0.3699 - acc: 0.9313 - val_loss: 0.3876 - val_acc: 0.9389 Epoch 22/100 40/40 [==============================] - 0s 11ms/step - loss: 0.3723 - acc: 0.9298 - val_loss: 0.3676 - val_acc: 0.9296 Epoch 23/100 40/40 [==============================] - 0s 11ms/step - loss: 0.3724 - acc: 0.9333 - val_loss: 0.3852 - val_acc: 0.9333 Epoch 24/100 40/40 [==============================] - 0s 11ms/step - loss: 0.3756 - acc: 0.9381 - val_loss: 0.3331 - val_acc: 0.9444 Epoch 25/100 40/40 [==============================] - 0s 11ms/step - loss: 0.3412 - acc: 0.9333 - val_loss: 0.3249 - val_acc: 0.9426 Epoch 26/100 40/40 [==============================] - 0s 11ms/step - loss: 0.3069 - acc: 0.9488 - val_loss: 0.3361 - val_acc: 0.9389 Epoch 27/100 40/40 [==============================] - 0s 9ms/step - loss: 0.3339 - acc: 0.9409 - val_loss: 0.3997 - val_acc: 0.9204 Epoch 28/100 40/40 [==============================] - 0s 10ms/step - loss: 0.3194 - acc: 0.9444 - val_loss: 0.3112 - val_acc: 0.9481 Epoch 29/100 40/40 [==============================] - 0s 10ms/step - loss: 0.2850 - acc: 0.9508 - val_loss: 0.2985 - val_acc: 0.9407 Epoch 30/100 40/40 [==============================] - 0s 9ms/step - loss: 0.2664 - acc: 0.9607 - val_loss: 0.3445 - val_acc: 0.9296 Epoch 31/100 
40/40 [==============================] - 0s 10ms/step - loss: 0.3194 - acc: 0.9504 - val_loss: 0.3007 - val_acc: 0.9519 Epoch 32/100 40/40 [==============================] - 0s 10ms/step - loss: 0.2950 - acc: 0.9516 - val_loss: 0.3034 - val_acc: 0.9556 Epoch 33/100 40/40 [==============================] - 0s 10ms/step - loss: 0.2954 - acc: 0.9536 - val_loss: 0.3217 - val_acc: 0.9426 Saved baseline model to: /tmp/tmp6jg1h4pu.h5
train_acc1 = history1.history['acc']
train_loss1 = history1.history['loss']
val_acc1 = history1.history['val_acc']
val_loss1 = history1.history['val_loss']
epochs1 = range(1, len(train_acc1) + 1)
plt.plot(epochs1, train_acc1, 'b', label='Training accuracy', color='darkmagenta')
plt.plot(epochs1, val_acc1, 'r', label='Validation accuracy', color='tab:orange')
plt.title('Training and validation accuracy CNN')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.savefig('/content/gdrive/MyDrive/ESC-10 material/acc_function_CNN_mfcc.png')
plt.show()
plt.plot(epochs1, train_loss1, 'b', label='Training loss', color='darkmagenta')
plt.plot(epochs1, val_loss1, 'r', label='Validation loss', color='tab:orange')
plt.title('Training and validation loss CNN')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.savefig('/content/gdrive/MyDrive/ESC-10 material/loss_function_CNN_mfcc.png')
plt.show()
<ipython-input-49-d30e1d7bec7c>:8: UserWarning: color is redundantly defined by the 'color' keyword argument and the fmt string "b" (-> color=(0.0, 0.0, 1.0, 1)). The keyword argument will take precedence. plt.plot(epochs1, train_acc1, 'b', label='Training accuracy', color='darkmagenta') <ipython-input-49-d30e1d7bec7c>:9: UserWarning: color is redundantly defined by the 'color' keyword argument and the fmt string "r" (-> color=(1.0, 0.0, 0.0, 1)). The keyword argument will take precedence. plt.plot(epochs1, val_acc1, 'r', label='Validation accuracy', color='tab:orange')
<ipython-input-49-d30e1d7bec7c>:17: UserWarning: color is redundantly defined by the 'color' keyword argument and the fmt string "b" (-> color=(0.0, 0.0, 1.0, 1)). The keyword argument will take precedence. plt.plot(epochs1, train_loss1, 'b', label='Training loss', color='darkmagenta') <ipython-input-49-d30e1d7bec7c>:18: UserWarning: color is redundantly defined by the 'color' keyword argument and the fmt string "r" (-> color=(1.0, 0.0, 0.0, 1)). The keyword argument will take precedence. plt.plot(epochs1, val_loss1, 'r', label='Validation loss', color='tab:orange')
Check accuracy on test set and all errors of the model
# Predict classes for test data
y_pred1 = model1.predict(X_test)
# Convert in the same form
y_pred1 = np.argmax(y_pred1, axis=1)
# Compare predictions to true labels
misclassified_indices = np.where(y_pred1 != y_test)[0]
print(misclassified_indices)
# evaluate the keras model
_, accuracy1 = model1.evaluate(X_test, y_cat_test)
initial_inference_time_1 = measure_inference_time(model1, X_test)
print('Accuracy: %.2f' % (accuracy1*100))
print("Time of inference:", round(initial_inference_time_1, 2))
print("Size of gzipped CNN with MFCC model: %.2f KB" % (get_gzipped_model_size(keras_file1)/ 1024))
17/17 [==============================] - 0s 3ms/step [ 2 6 24 32 67 137 146 153 173 199 224 242 244 259 274 292 350 366 380 391 397 404 414 427 431 447 455 461 463 465 504 524] 17/17 [==============================] - 0s 4ms/step - loss: 0.3342 - acc: 0.9407 17/17 [==============================] - 0s 3ms/step Accuracy: 94.07 Time of inference: 0.28 Size of gzipped CNN with MFCC model: 1622.69 KB
print(classification_report(y_test, y_pred1, target_names=mapping.keys()))
precision recall f1-score support
chainsaw 1.00 0.96 0.98 54
clock_tick 0.94 0.91 0.92 54
crackling_fire 0.91 0.91 0.91 54
crying_baby 0.93 1.00 0.96 54
dog 0.98 0.93 0.95 54
helicopter 0.82 0.94 0.88 54
rain 0.93 0.80 0.86 54
rooster 1.00 1.00 1.00 54
sea_waves 0.95 0.98 0.96 54
sneezing 0.96 0.98 0.97 54
accuracy 0.94 540
macro avg 0.94 0.94 0.94 540
weighted avg 0.94 0.94 0.94 540
r_cnn = confusion_matrix(y_test, y_pred1)
disp = ConfusionMatrixDisplay(confusion_matrix=r_cnn, display_labels=mapping)
fig, ax = plt.subplots(figsize=(6,6))
disp.plot(ax=ax)
ax.set_xticklabels(mapping.keys(), rotation='vertical')
plt.show()
plt.savefig('/content/gdrive/MyDrive/ESC-10 material/cnn_mfcc_conf_mat.png')
np.save('/content/gdrive/MyDrive/ESC-10 material/cm_cnn_mfcc.npy', r_cnn)
<Figure size 640x480 with 0 Axes>
Check a particular error and find sound
# indexes of a particular error
misclassifications = np.where((y_pred1 == 5) & (y_test == 6))[0]
print(misclassifications)
[ 2 67 274 350 463 465]
# listen the audio classified wrong
# the model mistakes a helicopter for rain
miscl = X_test[2]
idx = np.where(np.all(X == miscl, axis=(1, 2)))[0][0]
ipd.Audio(new_data[idx], rate=sample_rate)
# LSTM on the (216, 60) time-major MFCC sequences (X_train_rnn layout).
model3 = Sequential([
    # single recurrent layer; only the final hidden state is passed on
    layers.LSTM(units=256, input_shape=X_train_rnn.shape[1:]),
    layers.Dropout(0.2),
    # L1-regularized dense layer to curb overfitting
    layers.Dense(128, activation='relu', kernel_regularizer=rg.l1(0.001)),
    layers.Dropout(0.2),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    # 10-way softmax: one unit per ESC-10 class
    layers.Dense(10, activation = 'softmax')
])
model3.compile(loss = "categorical_crossentropy", optimizer = 'adam', metrics = 'acc')
model3.summary()
Model: "sequential_4"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm (LSTM) (None, 256) 324608
dropout_8 (Dropout) (None, 256) 0
dense_8 (Dense) (None, 128) 32896
dropout_9 (Dropout) (None, 128) 0
dense_9 (Dense) (None, 64) 8256
dropout_10 (Dropout) (None, 64) 0
dense_10 (Dense) (None, 10) 650
=================================================================
Total params: 366,410
Trainable params: 366,410
Non-trainable params: 0
_________________________________________________________________
# Train model
history3 = model3.fit(X_train_rnn, y_cat_train, batch_size=128, epochs=100, validation_data=(X_val_rnn, y_cat_val), callbacks=[early_stopping], shuffle=False)
# Evaluate model
score3 = model3.evaluate(X_val_rnn, y_cat_val, verbose=0)
# Save model
model3.save('/content/gdrive/MyDrive/models/rnn_mfcc.h5')
_, keras_file3 = tempfile.mkstemp('.h5')
tf.keras.models.save_model(model3, keras_file3, include_optimizer=False)
print('Saved baseline model to:', keras_file3)
Epoch 1/100 20/20 [==============================] - 5s 55ms/step - loss: 4.0654 - acc: 0.2179 - val_loss: 3.5965 - val_acc: 0.3444 Epoch 2/100 20/20 [==============================] - 1s 29ms/step - loss: 3.3947 - acc: 0.3266 - val_loss: 2.9460 - val_acc: 0.4852 Epoch 3/100 20/20 [==============================] - 0s 24ms/step - loss: 2.8558 - acc: 0.4278 - val_loss: 2.5014 - val_acc: 0.5444 Epoch 4/100 20/20 [==============================] - 0s 23ms/step - loss: 2.4376 - acc: 0.5171 - val_loss: 2.1589 - val_acc: 0.6500 Epoch 5/100 20/20 [==============================] - 0s 24ms/step - loss: 2.1169 - acc: 0.5730 - val_loss: 1.8282 - val_acc: 0.6963 Epoch 6/100 20/20 [==============================] - 0s 24ms/step - loss: 1.8555 - acc: 0.6528 - val_loss: 1.5969 - val_acc: 0.7537 Epoch 7/100 20/20 [==============================] - 0s 24ms/step - loss: 1.6585 - acc: 0.6817 - val_loss: 1.4665 - val_acc: 0.7741 Epoch 8/100 20/20 [==============================] - 0s 23ms/step - loss: 1.5290 - acc: 0.7139 - val_loss: 1.3009 - val_acc: 0.7833 Epoch 9/100 20/20 [==============================] - 0s 23ms/step - loss: 1.3853 - acc: 0.7429 - val_loss: 1.2121 - val_acc: 0.7963 Epoch 10/100 20/20 [==============================] - 0s 23ms/step - loss: 1.2598 - acc: 0.7734 - val_loss: 1.1284 - val_acc: 0.8167 Epoch 11/100 20/20 [==============================] - 0s 23ms/step - loss: 1.1721 - acc: 0.7988 - val_loss: 1.0268 - val_acc: 0.8426 Epoch 12/100 20/20 [==============================] - 0s 23ms/step - loss: 1.0615 - acc: 0.8226 - val_loss: 0.9763 - val_acc: 0.8370 Epoch 13/100 20/20 [==============================] - 0s 23ms/step - loss: 0.9594 - acc: 0.8540 - val_loss: 0.9000 - val_acc: 0.8593 Epoch 14/100 20/20 [==============================] - 0s 23ms/step - loss: 0.9102 - acc: 0.8571 - val_loss: 0.8436 - val_acc: 0.8593 Epoch 15/100 20/20 [==============================] - 0s 24ms/step - loss: 0.8640 - acc: 0.8659 - val_loss: 0.8116 - val_acc: 0.8685 Epoch 16/100 
20/20 [==============================] - 0s 24ms/step - loss: 0.7958 - acc: 0.8825 - val_loss: 0.7686 - val_acc: 0.8759 Epoch 17/100 20/20 [==============================] - 0s 23ms/step - loss: 0.7745 - acc: 0.8790 - val_loss: 0.7170 - val_acc: 0.8944 Epoch 18/100 20/20 [==============================] - 0s 24ms/step - loss: 0.7114 - acc: 0.9016 - val_loss: 0.6832 - val_acc: 0.8944 Epoch 19/100 20/20 [==============================] - 0s 24ms/step - loss: 0.6708 - acc: 0.9075 - val_loss: 0.6688 - val_acc: 0.8815 Epoch 20/100 20/20 [==============================] - 0s 24ms/step - loss: 0.6537 - acc: 0.9123 - val_loss: 0.6204 - val_acc: 0.9111 Epoch 21/100 20/20 [==============================] - 0s 24ms/step - loss: 0.5983 - acc: 0.9226 - val_loss: 0.5902 - val_acc: 0.9259 Epoch 22/100 20/20 [==============================] - 0s 24ms/step - loss: 0.5991 - acc: 0.9107 - val_loss: 0.5453 - val_acc: 0.9204 Epoch 23/100 20/20 [==============================] - 0s 24ms/step - loss: 0.5464 - acc: 0.9274 - val_loss: 0.5205 - val_acc: 0.9426 Epoch 24/100 20/20 [==============================] - 0s 24ms/step - loss: 0.5097 - acc: 0.9373 - val_loss: 0.5048 - val_acc: 0.9352 Epoch 25/100 20/20 [==============================] - 0s 23ms/step - loss: 0.4906 - acc: 0.9381 - val_loss: 0.4908 - val_acc: 0.9426 Epoch 26/100 20/20 [==============================] - 0s 23ms/step - loss: 0.4514 - acc: 0.9504 - val_loss: 0.4641 - val_acc: 0.9444 Epoch 27/100 20/20 [==============================] - 0s 23ms/step - loss: 0.4381 - acc: 0.9468 - val_loss: 0.4518 - val_acc: 0.9426 Epoch 28/100 20/20 [==============================] - 0s 23ms/step - loss: 0.4117 - acc: 0.9516 - val_loss: 0.4252 - val_acc: 0.9444 Epoch 29/100 20/20 [==============================] - 0s 23ms/step - loss: 0.3721 - acc: 0.9627 - val_loss: 0.4151 - val_acc: 0.9519 Epoch 30/100 20/20 [==============================] - 0s 23ms/step - loss: 0.3652 - acc: 0.9595 - val_loss: 0.4023 - val_acc: 0.9407 Epoch 31/100 
20/20 [==============================] - 0s 23ms/step - loss: 0.3637 - acc: 0.9587 - val_loss: 0.4097 - val_acc: 0.9389 Epoch 32/100 20/20 [==============================] - 0s 23ms/step - loss: 0.3429 - acc: 0.9615 - val_loss: 0.3760 - val_acc: 0.9481 Epoch 33/100 20/20 [==============================] - 0s 23ms/step - loss: 0.3282 - acc: 0.9647 - val_loss: 0.3966 - val_acc: 0.9444 Epoch 34/100 20/20 [==============================] - 0s 22ms/step - loss: 0.3177 - acc: 0.9659 - val_loss: 0.4149 - val_acc: 0.9296 Epoch 35/100 20/20 [==============================] - 0s 23ms/step - loss: 0.3439 - acc: 0.9575 - val_loss: 0.3697 - val_acc: 0.9519 Epoch 36/100 20/20 [==============================] - 0s 23ms/step - loss: 0.3102 - acc: 0.9698 - val_loss: 0.3651 - val_acc: 0.9444 Epoch 37/100 20/20 [==============================] - 0s 23ms/step - loss: 0.3070 - acc: 0.9643 - val_loss: 0.3633 - val_acc: 0.9407 Epoch 38/100 20/20 [==============================] - 0s 23ms/step - loss: 0.2891 - acc: 0.9714 - val_loss: 0.3471 - val_acc: 0.9500 Epoch 39/100 20/20 [==============================] - 0s 23ms/step - loss: 0.2791 - acc: 0.9714 - val_loss: 0.3161 - val_acc: 0.9630 Epoch 40/100 20/20 [==============================] - 0s 23ms/step - loss: 0.2551 - acc: 0.9810 - val_loss: 0.3353 - val_acc: 0.9519 Epoch 41/100 20/20 [==============================] - 0s 24ms/step - loss: 0.2482 - acc: 0.9766 - val_loss: 0.3404 - val_acc: 0.9481 Epoch 42/100 20/20 [==============================] - 0s 24ms/step - loss: 0.2316 - acc: 0.9806 - val_loss: 0.3634 - val_acc: 0.9389 Epoch 43/100 20/20 [==============================] - 0s 24ms/step - loss: 0.2580 - acc: 0.9726 - val_loss: 0.3708 - val_acc: 0.9389 Saved baseline model to: /tmp/tmpbwz2qkh7.h5
# Plot training/validation accuracy and loss curves for the RNN (MFCC) model.
train_acc3 = history3.history['acc']
train_loss3 = history3.history['loss']
val_acc3 = history3.history['val_acc']
val_loss3 = history3.history['val_loss']
epochs3 = range(1, len(train_acc3) + 1)
# Pass only the `color` keyword: combining it with a fmt string like 'b'/'r'
# is redundant and raises a UserWarning (the keyword wins, so the rendered
# colours are unchanged by dropping the fmt strings).
plt.plot(epochs3, train_acc3, label='Training accuracy', color='darkmagenta')
plt.plot(epochs3, val_acc3, label='Validation accuracy', color='tab:orange')
plt.title('Training and validation accuracy RNN')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
# Save before show() so the rendered figure (not a fresh blank one) is written.
plt.savefig('/content/gdrive/MyDrive/ESC-10 material/acc_function_RNN_mfcc.png')
plt.show()
plt.plot(epochs3, train_loss3, label='Training loss', color='darkmagenta')
plt.plot(epochs3, val_loss3, label='Validation loss', color='tab:orange')
plt.title('Training and validation loss RNN')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.savefig('/content/gdrive/MyDrive/ESC-10 material/loss_function_RNN_mfcc.png')
plt.show()
<ipython-input-75-188aaa5bde2b>:8: UserWarning: color is redundantly defined by the 'color' keyword argument and the fmt string "b" (-> color=(0.0, 0.0, 1.0, 1)). The keyword argument will take precedence. plt.plot(epochs3, train_acc3, 'b', label='Training accuracy', color='darkmagenta') <ipython-input-75-188aaa5bde2b>:9: UserWarning: color is redundantly defined by the 'color' keyword argument and the fmt string "r" (-> color=(1.0, 0.0, 0.0, 1)). The keyword argument will take precedence. plt.plot(epochs3, val_acc3, 'r', label='Validation accuracy', color='tab:orange')
<ipython-input-75-188aaa5bde2b>:17: UserWarning: color is redundantly defined by the 'color' keyword argument and the fmt string "b" (-> color=(0.0, 0.0, 1.0, 1)). The keyword argument will take precedence. plt.plot(epochs3, train_loss3, 'b', label='Training loss', color='darkmagenta') <ipython-input-75-188aaa5bde2b>:18: UserWarning: color is redundantly defined by the 'color' keyword argument and the fmt string "r" (-> color=(1.0, 0.0, 0.0, 1)). The keyword argument will take precedence. plt.plot(epochs3, val_loss3, 'r', label='Validation loss', color='tab:orange')
Check accuracy on test set and all errors of the model
# Predict class probabilities for the RNN (MFCC) model on the test set
y_pred3 = model3.predict(X_test_rnn)
# Convert probability vectors to predicted class indices (same form as y_test)
y_pred3 = np.argmax(y_pred3, axis=1)
# Compare predictions to true labels; collect indices of every test error
misclassified_indices = np.where(y_pred3 != y_test)[0]
print(misclassified_indices)
# evaluate the keras model (loss is discarded, only accuracy is reported)
_, accuracy3 = model3.evaluate(X_test_rnn, y_cat_test)
# measure_inference_time / get_gzipped_model_size are helpers defined in an
# earlier cell — presumably seconds and bytes respectively; verify there.
initial_inference_time_3 = measure_inference_time(model3, X_test_rnn)
print('Accuracy: %.2f' % (accuracy3*100))
print("Time of inference:", round(initial_inference_time_3, 2))
print("Size of gzipped RNN with MFCC model: %.2f KB" % (get_gzipped_model_size(keras_file3)/ 1024))
17/17 [==============================] - 0s 6ms/step [ 13 33 35 67 118 125 173 204 241 255 274 369 446 447 455 465 475 488 512 519 520 524] 17/17 [==============================] - 0s 7ms/step - loss: 0.3222 - acc: 0.9593 17/17 [==============================] - 0s 6ms/step Accuracy: 95.93 Time of inference: 0.2 Size of gzipped RNN with MFCC model: 1334.07 KB
print(classification_report(y_test, y_pred3, target_names=mapping.keys()))
precision recall f1-score support
chainsaw 1.00 0.98 0.99 54
clock_tick 0.88 0.94 0.91 54
crackling_fire 0.91 0.89 0.90 54
crying_baby 1.00 1.00 1.00 54
dog 0.96 0.96 0.96 54
helicopter 0.96 0.98 0.97 54
rain 0.94 0.93 0.93 54
rooster 1.00 1.00 1.00 54
sea_waves 0.95 0.96 0.95 54
sneezing 1.00 0.94 0.97 54
accuracy 0.96 540
macro avg 0.96 0.96 0.96 540
weighted avg 0.96 0.96 0.96 540
# Confusion matrix for the RNN (MFCC) model, plotted, saved, and persisted.
r_rnn = confusion_matrix(y_test, y_pred3)
disp = ConfusionMatrixDisplay(confusion_matrix=r_rnn, display_labels=mapping)
fig, ax = plt.subplots(figsize=(6,6))
disp.plot(ax=ax)
ax.set_xticklabels(mapping.keys(), rotation='vertical')
# Save BEFORE plt.show(): show() finishes the current figure, so a later
# savefig() would write an empty canvas (the stray
# "<Figure size 640x480 with 0 Axes>" seen in the original output).
fig.savefig('/content/gdrive/MyDrive/ESC-10 material/rnn_mfcc_conf_mat.png')
plt.show()
# Also persist the raw matrix for later comparison across models.
np.save('/content/gdrive/MyDrive/ESC-10 material/cm_rnn_mfcc.npy', r_rnn)
<Figure size 640x480 with 0 Axes>
Inspect one specific misclassification type and locate the corresponding audio clip
# Indices of one particular error type: predicted class 1 (clock_tick)
# while the true class is 2 (crackling_fire).
misclassifications = np.where((y_pred3 == 1) & (y_test == 2))[0]
print(misclassifications)
[ 13 369 446 455]
# Listen to one of the wrongly classified clips.
# The model mistakes this crackling fire for a clock tick.
miscl = X_test[455]
# Match the feature matrix back to its row in the full feature array X to
# recover the original sample index (assumes feature rows are unique — TODO confirm).
idx = np.where(np.all(X == miscl, axis=(1, 2)))[0][0]
# new_data holds the raw waveforms aligned with X (defined in an earlier cell).
ipd.Audio(new_data[idx], rate=sample_rate)
# CRNN on MFCC features: three Conv2D blocks extract local time-frequency
# patterns, then an LSTM models the remaining temporal dimension.
model2 = Sequential([
    # Strided conv halves both dims; seeded initializer for reproducibility
    layers.Conv2D(32, (4,4),activation = 'relu',padding='valid', input_shape=X_train.shape[1:], strides=(2,2), kernel_initializer=glorot_uniform(seed=seed_value)),
    layers.MaxPooling2D(2, padding='same'),
    layers.Dropout(0.3),
    # L1 regularization on the deeper conv layers to limit overfitting
    layers.Conv2D(128, (3,3), activation='relu',padding='valid', strides=(2,2), kernel_regularizer=rg.l1(0.001)),
    layers.Dropout(0.2),
    layers.Conv2D(64, (2,2), activation='relu',padding='valid', kernel_regularizer=rg.l1(0.001)),
    layers.MaxPooling2D(2, padding='same'),
    layers.Dropout(0.3),
    # Flatten each time step separately so the LSTM sees a (time, features) sequence
    layers.TimeDistributed(Flatten()),
    layers.LSTM(64, return_sequences=False),
    layers.Dropout(0.3),
    # 10 ESC-10 classes, softmax for categorical cross-entropy
    layers.Dense(10, activation = 'softmax')
])
model2.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = 'acc')
model2.summary()
Model: "sequential_7"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d_18 (Conv2D) (None, 29, 107, 32) 1568
max_pooling2d_16 (MaxPoolin (None, 15, 54, 32) 0
g2D)
dropout_19 (Dropout) (None, 15, 54, 32) 0
conv2d_19 (Conv2D) (None, 7, 26, 128) 36992
dropout_20 (Dropout) (None, 7, 26, 128) 0
conv2d_20 (Conv2D) (None, 6, 25, 64) 32832
max_pooling2d_17 (MaxPoolin (None, 3, 13, 64) 0
g2D)
dropout_21 (Dropout) (None, 3, 13, 64) 0
time_distributed_2 (TimeDis (None, 3, 832) 0
tributed)
lstm_3 (LSTM) (None, 64) 229632
dropout_22 (Dropout) (None, 64) 0
dense_13 (Dense) (None, 10) 650
=================================================================
Total params: 301,674
Trainable params: 301,674
Non-trainable params: 0
_________________________________________________________________
# Train model (early_stopping callback defined in an earlier cell;
# shuffle=False keeps batch order deterministic across runs)
history2 = model2.fit(X_train, y_cat_train, batch_size=64, epochs=100, validation_data=(X_val, y_cat_val), callbacks=[early_stopping], shuffle=False)
# Evaluate model on the validation split
score2 = model2.evaluate(X_val, y_cat_val, verbose=0)
# Save model to Drive for reuse
model2.save('/content/gdrive/MyDrive/models/crnn_mfcc.h5')
# Also save a copy without the optimizer state to measure the gzipped model size
_, keras_file2 = tempfile.mkstemp('.h5')
tf.keras.models.save_model(model2, keras_file2, include_optimizer=False)
print('Saved baseline model to:', keras_file2)
Epoch 1/100 40/40 [==============================] - 4s 33ms/step - loss: 4.5531 - acc: 0.1710 - val_loss: 3.8973 - val_acc: 0.3278 Epoch 2/100 40/40 [==============================] - 1s 13ms/step - loss: 3.4999 - acc: 0.3290 - val_loss: 2.9137 - val_acc: 0.5074 Epoch 3/100 40/40 [==============================] - 0s 12ms/step - loss: 2.6855 - acc: 0.4567 - val_loss: 2.2731 - val_acc: 0.5648 Epoch 4/100 40/40 [==============================] - 0s 12ms/step - loss: 2.1417 - acc: 0.5536 - val_loss: 1.7866 - val_acc: 0.6685 Epoch 5/100 40/40 [==============================] - 0s 11ms/step - loss: 1.7824 - acc: 0.6139 - val_loss: 1.5069 - val_acc: 0.7296 Epoch 6/100 40/40 [==============================] - 1s 13ms/step - loss: 1.5063 - acc: 0.6893 - val_loss: 1.2873 - val_acc: 0.7630 Epoch 7/100 40/40 [==============================] - 1s 14ms/step - loss: 1.3411 - acc: 0.7032 - val_loss: 1.1243 - val_acc: 0.8037 Epoch 8/100 40/40 [==============================] - 0s 12ms/step - loss: 1.1958 - acc: 0.7448 - val_loss: 1.0348 - val_acc: 0.8111 Epoch 9/100 40/40 [==============================] - 0s 12ms/step - loss: 1.0704 - acc: 0.7627 - val_loss: 0.9187 - val_acc: 0.8315 Epoch 10/100 40/40 [==============================] - 1s 13ms/step - loss: 0.9973 - acc: 0.7774 - val_loss: 0.8461 - val_acc: 0.8370 Epoch 11/100 40/40 [==============================] - 0s 12ms/step - loss: 0.9097 - acc: 0.7944 - val_loss: 0.7507 - val_acc: 0.8685 Epoch 12/100 40/40 [==============================] - 0s 12ms/step - loss: 0.8101 - acc: 0.8278 - val_loss: 0.6802 - val_acc: 0.8667 Epoch 13/100 40/40 [==============================] - 1s 13ms/step - loss: 0.7626 - acc: 0.8349 - val_loss: 0.6447 - val_acc: 0.8833 Epoch 14/100 40/40 [==============================] - 1s 13ms/step - loss: 0.6843 - acc: 0.8619 - val_loss: 0.6245 - val_acc: 0.8556 Epoch 15/100 40/40 [==============================] - 1s 13ms/step - loss: 0.6595 - acc: 0.8647 - val_loss: 0.5779 - val_acc: 0.8870 Epoch 16/100 
40/40 [==============================] - 1s 14ms/step - loss: 0.6193 - acc: 0.8659 - val_loss: 0.5223 - val_acc: 0.9037 Epoch 17/100 40/40 [==============================] - 1s 14ms/step - loss: 0.5717 - acc: 0.8817 - val_loss: 0.4976 - val_acc: 0.9074 Epoch 18/100 40/40 [==============================] - 0s 12ms/step - loss: 0.5499 - acc: 0.8821 - val_loss: 0.4625 - val_acc: 0.9148 Epoch 19/100 40/40 [==============================] - 0s 12ms/step - loss: 0.5304 - acc: 0.8889 - val_loss: 0.4586 - val_acc: 0.9111 Epoch 20/100 40/40 [==============================] - 0s 12ms/step - loss: 0.5126 - acc: 0.8980 - val_loss: 0.4526 - val_acc: 0.9185 Epoch 21/100 40/40 [==============================] - 0s 12ms/step - loss: 0.4727 - acc: 0.9103 - val_loss: 0.4202 - val_acc: 0.9259 Epoch 22/100 40/40 [==============================] - 0s 12ms/step - loss: 0.4482 - acc: 0.9131 - val_loss: 0.3959 - val_acc: 0.9333 Epoch 23/100 40/40 [==============================] - 0s 11ms/step - loss: 0.4204 - acc: 0.9266 - val_loss: 0.3537 - val_acc: 0.9426 Epoch 24/100 40/40 [==============================] - 0s 12ms/step - loss: 0.4146 - acc: 0.9202 - val_loss: 0.3512 - val_acc: 0.9463 Epoch 25/100 40/40 [==============================] - 0s 12ms/step - loss: 0.3983 - acc: 0.9194 - val_loss: 0.3614 - val_acc: 0.9296 Epoch 26/100 40/40 [==============================] - 0s 12ms/step - loss: 0.3858 - acc: 0.9286 - val_loss: 0.3673 - val_acc: 0.9389 Epoch 27/100 40/40 [==============================] - 0s 11ms/step - loss: 0.3751 - acc: 0.9329 - val_loss: 0.3530 - val_acc: 0.9185 Epoch 28/100 40/40 [==============================] - 0s 11ms/step - loss: 0.3572 - acc: 0.9325 - val_loss: 0.3250 - val_acc: 0.9500 Epoch 29/100 40/40 [==============================] - 0s 11ms/step - loss: 0.3594 - acc: 0.9385 - val_loss: 0.2897 - val_acc: 0.9481 Epoch 30/100 40/40 [==============================] - 0s 11ms/step - loss: 0.3357 - acc: 0.9417 - val_loss: 0.3015 - val_acc: 0.9574 Epoch 31/100 
40/40 [==============================] - 0s 12ms/step - loss: 0.3624 - acc: 0.9373 - val_loss: 0.2819 - val_acc: 0.9648 Epoch 32/100 40/40 [==============================] - 0s 11ms/step - loss: 0.3234 - acc: 0.9476 - val_loss: 0.2992 - val_acc: 0.9593 Epoch 33/100 40/40 [==============================] - 0s 12ms/step - loss: 0.3160 - acc: 0.9440 - val_loss: 0.3452 - val_acc: 0.9296 Epoch 34/100 40/40 [==============================] - 0s 11ms/step - loss: 0.2959 - acc: 0.9496 - val_loss: 0.2725 - val_acc: 0.9685 Epoch 35/100 40/40 [==============================] - 0s 12ms/step - loss: 0.2691 - acc: 0.9623 - val_loss: 0.2474 - val_acc: 0.9611 Epoch 36/100 40/40 [==============================] - 0s 11ms/step - loss: 0.2736 - acc: 0.9623 - val_loss: 0.2739 - val_acc: 0.9667 Epoch 37/100 40/40 [==============================] - 0s 12ms/step - loss: 0.2595 - acc: 0.9659 - val_loss: 0.2597 - val_acc: 0.9537 Epoch 38/100 40/40 [==============================] - 1s 13ms/step - loss: 0.2705 - acc: 0.9532 - val_loss: 0.2643 - val_acc: 0.9574 Epoch 39/100 40/40 [==============================] - 1s 13ms/step - loss: 0.2537 - acc: 0.9619 - val_loss: 0.2590 - val_acc: 0.9556 Saved baseline model to: /tmp/tmpygld3dkk.h5
# Plot training/validation accuracy and loss curves for the CRNN (MFCC) model.
train_acc2 = history2.history['acc']
train_loss2 = history2.history['loss']
val_acc2 = history2.history['val_acc']
val_loss2 = history2.history['val_loss']
epochs2 = range(1, len(train_acc2) + 1)
# Use only the `color` keyword: adding a fmt string ('b'/'r') redundantly
# sets the colour too and raises a UserWarning; the keyword takes precedence,
# so dropping the fmt strings leaves the plots identical.
plt.plot(epochs2, train_acc2, label='Training accuracy', color='darkmagenta')
plt.plot(epochs2, val_acc2, label='Validation accuracy', color='tab:orange')
plt.title('Training and validation accuracy CRNN')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
# Save before show() so the rendered figure is written, not a blank one.
plt.savefig('/content/gdrive/MyDrive/ESC-10 material/acc_function_CRNN_mfcc.png')
plt.show()
plt.plot(epochs2, train_loss2, label='Training loss', color='darkmagenta')
plt.plot(epochs2, val_loss2, label='Validation loss', color='tab:orange')
plt.title('Training and validation loss CRNN')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.savefig('/content/gdrive/MyDrive/ESC-10 material/loss_function_CRNN_mfcc.png')
plt.show()
<ipython-input-93-93eafaba5ee3>:8: UserWarning: color is redundantly defined by the 'color' keyword argument and the fmt string "b" (-> color=(0.0, 0.0, 1.0, 1)). The keyword argument will take precedence. plt.plot(epochs2, train_acc2, 'b', label='Training accuracy', color='darkmagenta') <ipython-input-93-93eafaba5ee3>:9: UserWarning: color is redundantly defined by the 'color' keyword argument and the fmt string "r" (-> color=(1.0, 0.0, 0.0, 1)). The keyword argument will take precedence. plt.plot(epochs2, val_acc2, 'r', label='Validation accuracy', color='tab:orange')
<ipython-input-93-93eafaba5ee3>:17: UserWarning: color is redundantly defined by the 'color' keyword argument and the fmt string "b" (-> color=(0.0, 0.0, 1.0, 1)). The keyword argument will take precedence. plt.plot(epochs2, train_loss2, 'b', label='Training loss', color='darkmagenta') <ipython-input-93-93eafaba5ee3>:18: UserWarning: color is redundantly defined by the 'color' keyword argument and the fmt string "r" (-> color=(1.0, 0.0, 0.0, 1)). The keyword argument will take precedence. plt.plot(epochs2, val_loss2, 'r', label='Validation loss', color='tab:orange')
Check accuracy on test set and all errors of the model
# Predict class probabilities for the CRNN (MFCC) model on the test set
y_pred2 = model2.predict(X_test)
# Convert probability vectors to predicted class indices (same form as y_test)
y_pred2 = np.argmax(y_pred2, axis=1)
# Compare predictions to true labels; collect indices of every test error
misclassified_indices = np.where(y_pred2 != y_test)[0]
print(misclassified_indices)
# evaluate the keras model (loss is discarded, only accuracy is reported)
_, accuracy2 = model2.evaluate(X_test, y_cat_test)
initial_inference_time_2 = measure_inference_time(model2, X_test)
print('Accuracy: %.2f' % (accuracy2*100))
print("Time of inference:", round(initial_inference_time_2, 2))
# Fixed copy-paste label: this cell reports the CRNN model, not the RNN.
print("Size of gzipped CRNN with MFCC model: %.2f KB" % (get_gzipped_model_size(keras_file2)/ 1024))
17/17 [==============================] - 0s 4ms/step [ 32 70 92 109 173 212 223 224 237 241 244 259 272 292 311 314 366 397 414 427 446 447 455 461 467 504 516 529] 17/17 [==============================] - 0s 4ms/step - loss: 0.2790 - acc: 0.9481 17/17 [==============================] - 0s 4ms/step Accuracy: 94.81 Time of inference: 0.28 Size of gzipped RNN with MFCC model: 1100.95 KB
print(classification_report(y_test, y_pred2, target_names=mapping.keys()))
precision recall f1-score support
chainsaw 1.00 0.98 0.99 54
clock_tick 0.91 0.94 0.93 54
crackling_fire 0.91 0.91 0.91 54
crying_baby 0.98 0.98 0.98 54
dog 0.98 0.89 0.93 54
helicopter 0.94 0.94 0.94 54
rain 0.94 0.89 0.91 54
rooster 0.96 1.00 0.98 54
sea_waves 0.93 0.96 0.95 54
sneezing 0.93 0.98 0.95 54
accuracy 0.95 540
macro avg 0.95 0.95 0.95 540
weighted avg 0.95 0.95 0.95 540
# Confusion matrix for the CRNN (MFCC) model, plotted, saved, and persisted.
r_crnn = confusion_matrix(y_test, y_pred2)
disp = ConfusionMatrixDisplay(confusion_matrix=r_crnn, display_labels=mapping)
fig, ax = plt.subplots(figsize=(6,6))
disp.plot(ax=ax)
ax.set_xticklabels(mapping.keys(), rotation='vertical')
# Save BEFORE plt.show(): show() finishes the current figure, so a later
# savefig() would write an empty canvas (the stray
# "<Figure size 640x480 with 0 Axes>" seen in the original output).
fig.savefig('/content/gdrive/MyDrive/ESC-10 material/crnn_mfcc_conf_mat.png')
plt.show()
# Also persist the raw matrix for later comparison across models.
np.save('/content/gdrive/MyDrive/ESC-10 material/cm_crnn_mfcc.npy', r_crnn)
<Figure size 640x480 with 0 Axes>
Inspect one specific misclassification type and locate the corresponding audio clip
# Indices of one particular error type: predicted class 8 (sea_waves)
# while the true class is 6 (rain).
misclassifications = np.where((y_pred2 == 8) & (y_test == 6))[0]
print(misclassifications)
[ 92 311 529]
# The model mistakes some rain sounds for sea-wave sounds; listen to one.
miscl = X_test[311]
# Match the feature matrix back to its row in the full feature array X to
# recover the original sample index (assumes feature rows are unique — TODO confirm).
idx = np.where(np.all(X == miscl, axis=(1, 2)))[0][0]
# new_data holds the raw waveforms aligned with X (defined in an earlier cell).
ipd.Audio(new_data[idx], rate=sample_rate)
In the model below, spectrograms are used as the model input. Specifically, each sample in X is an array encoding the spectrogram image.
# 70/15/15 stratified train/val/test split of the spectrogram data.
val_size = .15
test_size = .15
# First carve off 30% (val + test) from the full set, stratified on the labels
X_train_spec, X_test_val_spec, y_cat_train, y_cat_test_val = train_test_split(X_spec, y_cat, test_size=val_size+test_size, random_state=42, stratify=y_cat)
# Then split that 30% in half: test_size/(test_size+val_size) == 0.5
X_val_spec, X_test_spec, y_cat_val, y_cat_test = train_test_split(X_test_val_spec, y_cat_test_val, test_size=test_size/(test_size+val_size), random_state=42, stratify=y_cat_test_val)
print(X_train_spec.shape, X_test_spec.shape, X_val_spec.shape, y_cat_train.shape, y_cat_test.shape, y_cat_val.shape)
(2520, 224, 224, 3) (540, 224, 224, 3) (540, 224, 224, 3) (2520, 10) (540, 10) (540, 10)
# Recover integer class labels from the one-hot encoded targets.
y_train = np.argmax(y_cat_train, axis = 1)
y_val = np.argmax(y_cat_val, axis = 1)
y_test = np.argmax(y_cat_test, axis = 1)
# Plain CNN on spectrogram images (224x224x3): three strided Conv2D +
# max-pooling stages, then a dense classification head.
model4 = Sequential([
    layers.Conv2D(64, (4,4), activation = 'relu', padding='valid', strides=(2, 2), input_shape = X_train_spec.shape[1:]),
    layers.MaxPooling2D(2, padding='valid'),
    # L2 regularization on the deeper conv layers to limit overfitting
    layers.Conv2D(128, (3,3), activation='relu', padding='valid', strides=(2, 2), kernel_regularizer=rg.l2(0.01)),
    layers.MaxPooling2D(2, padding='valid'),
    layers.Conv2D(64, (2,2), activation='relu',padding='valid', kernel_regularizer=rg.l2(0.01)),
    layers.MaxPooling2D(2, padding='valid'),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.4),
    # 10 ESC-10 classes, softmax for categorical cross-entropy
    layers.Dense(10, activation = 'softmax')
])
model4.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = 'acc')
model4.summary()
Model: "sequential_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d_3 (Conv2D) (None, 111, 111, 64) 3136
max_pooling2d_3 (MaxPooling (None, 55, 55, 64) 0
2D)
conv2d_4 (Conv2D) (None, 27, 27, 128) 73856
max_pooling2d_4 (MaxPooling (None, 13, 13, 128) 0
2D)
conv2d_5 (Conv2D) (None, 12, 12, 64) 32832
max_pooling2d_5 (MaxPooling (None, 6, 6, 64) 0
2D)
flatten_1 (Flatten) (None, 2304) 0
dense_2 (Dense) (None, 64) 147520
dropout_1 (Dropout) (None, 64) 0
dense_3 (Dense) (None, 10) 650
=================================================================
Total params: 257,994
Trainable params: 257,994
Non-trainable params: 0
_________________________________________________________________
# Train model (NOTE(review): unlike the CRNN cell, no early-stopping callback
# is used here — runs the full 80 epochs; confirm this is intentional)
history4 = model4.fit(X_train_spec, y_cat_train, validation_data=(X_val_spec, y_cat_val), batch_size = 64, epochs=80, shuffle=False)
# Evaluate model on the validation split
score4 = model4.evaluate(X_val_spec, y_cat_val, verbose=0)
# Save model to Drive for reuse
model4.save('/content/gdrive/MyDrive/models/cnn_spec.h5')
# Also save a copy without the optimizer state to measure the gzipped model size
_, keras_file4 = tempfile.mkstemp('.h5')
tf.keras.models.save_model(model4, keras_file4, include_optimizer=False)
print('Saved baseline model to:', keras_file4)
Epoch 1/80 40/40 [==============================] - 4s 48ms/step - loss: 3.0883 - acc: 0.1905 - val_loss: 2.3199 - val_acc: 0.3204 Epoch 2/80 40/40 [==============================] - 1s 29ms/step - loss: 2.1042 - acc: 0.3226 - val_loss: 1.8466 - val_acc: 0.3704 Epoch 3/80 40/40 [==============================] - 1s 27ms/step - loss: 1.7437 - acc: 0.4159 - val_loss: 1.4628 - val_acc: 0.5537 Epoch 4/80 40/40 [==============================] - 1s 27ms/step - loss: 1.4759 - acc: 0.5179 - val_loss: 1.2167 - val_acc: 0.6648 Epoch 5/80 40/40 [==============================] - 1s 28ms/step - loss: 1.2936 - acc: 0.6012 - val_loss: 1.0664 - val_acc: 0.7315 Epoch 6/80 40/40 [==============================] - 1s 28ms/step - loss: 1.1419 - acc: 0.6369 - val_loss: 0.9102 - val_acc: 0.7833 Epoch 7/80 40/40 [==============================] - 1s 28ms/step - loss: 1.0544 - acc: 0.6758 - val_loss: 0.8743 - val_acc: 0.7519 Epoch 8/80 40/40 [==============================] - 1s 28ms/step - loss: 0.9834 - acc: 0.7032 - val_loss: 0.7957 - val_acc: 0.8148 Epoch 9/80 40/40 [==============================] - 1s 29ms/step - loss: 0.9296 - acc: 0.7258 - val_loss: 0.7494 - val_acc: 0.8019 Epoch 10/80 40/40 [==============================] - 1s 28ms/step - loss: 0.8709 - acc: 0.7444 - val_loss: 0.6914 - val_acc: 0.8148 Epoch 11/80 40/40 [==============================] - 1s 29ms/step - loss: 0.8289 - acc: 0.7679 - val_loss: 0.6973 - val_acc: 0.8407 Epoch 12/80 40/40 [==============================] - 1s 30ms/step - loss: 0.7777 - acc: 0.7857 - val_loss: 0.8692 - val_acc: 0.7519 Epoch 13/80 40/40 [==============================] - 1s 30ms/step - loss: 0.7764 - acc: 0.7718 - val_loss: 0.6826 - val_acc: 0.8333 Epoch 14/80 40/40 [==============================] - 1s 30ms/step - loss: 0.7066 - acc: 0.8079 - val_loss: 0.5767 - val_acc: 0.8667 Epoch 15/80 40/40 [==============================] - 1s 28ms/step - loss: 0.6988 - acc: 0.8028 - val_loss: 0.6990 - val_acc: 0.8185 Epoch 16/80 40/40 
[==============================] - 1s 27ms/step - loss: 0.6925 - acc: 0.8135 - val_loss: 0.5710 - val_acc: 0.8630 Epoch 17/80 40/40 [==============================] - 1s 28ms/step - loss: 0.6410 - acc: 0.8238 - val_loss: 0.5725 - val_acc: 0.8463 Epoch 18/80 40/40 [==============================] - 1s 28ms/step - loss: 0.6147 - acc: 0.8381 - val_loss: 0.5526 - val_acc: 0.8593 Epoch 19/80 40/40 [==============================] - 1s 28ms/step - loss: 0.5989 - acc: 0.8421 - val_loss: 0.5482 - val_acc: 0.8630 Epoch 20/80 40/40 [==============================] - 1s 27ms/step - loss: 0.6401 - acc: 0.8218 - val_loss: 0.5249 - val_acc: 0.8685 Epoch 21/80 40/40 [==============================] - 1s 28ms/step - loss: 0.5604 - acc: 0.8563 - val_loss: 0.4994 - val_acc: 0.8685 Epoch 22/80 40/40 [==============================] - 1s 28ms/step - loss: 0.5620 - acc: 0.8599 - val_loss: 0.4802 - val_acc: 0.8685 Epoch 23/80 40/40 [==============================] - 1s 28ms/step - loss: 0.5251 - acc: 0.8623 - val_loss: 0.4818 - val_acc: 0.8796 Epoch 24/80 40/40 [==============================] - 1s 31ms/step - loss: 0.5424 - acc: 0.8516 - val_loss: 0.4933 - val_acc: 0.8815 Epoch 25/80 40/40 [==============================] - 1s 30ms/step - loss: 0.5106 - acc: 0.8679 - val_loss: 0.4627 - val_acc: 0.8907 Epoch 26/80 40/40 [==============================] - 1s 30ms/step - loss: 0.5731 - acc: 0.8484 - val_loss: 0.4734 - val_acc: 0.8741 Epoch 27/80 40/40 [==============================] - 1s 30ms/step - loss: 0.4992 - acc: 0.8702 - val_loss: 0.4315 - val_acc: 0.8981 Epoch 28/80 40/40 [==============================] - 1s 28ms/step - loss: 0.4836 - acc: 0.8786 - val_loss: 0.4328 - val_acc: 0.9037 Epoch 29/80 40/40 [==============================] - 1s 27ms/step - loss: 0.5079 - acc: 0.8619 - val_loss: 0.4957 - val_acc: 0.8759 Epoch 30/80 40/40 [==============================] - 1s 31ms/step - loss: 0.4851 - acc: 0.8821 - val_loss: 0.4323 - val_acc: 0.8907 Epoch 31/80 40/40 
[==============================] - 1s 33ms/step - loss: 0.4588 - acc: 0.8857 - val_loss: 0.4566 - val_acc: 0.9000 Epoch 32/80 40/40 [==============================] - 1s 34ms/step - loss: 0.4403 - acc: 0.8869 - val_loss: 0.4064 - val_acc: 0.9093 Epoch 33/80 40/40 [==============================] - 1s 32ms/step - loss: 0.4623 - acc: 0.8817 - val_loss: 0.4304 - val_acc: 0.9019 Epoch 34/80 40/40 [==============================] - 1s 31ms/step - loss: 0.4407 - acc: 0.8813 - val_loss: 0.4290 - val_acc: 0.8963 Epoch 35/80 40/40 [==============================] - 1s 35ms/step - loss: 0.4392 - acc: 0.8917 - val_loss: 0.4832 - val_acc: 0.8741 Epoch 36/80 40/40 [==============================] - 1s 35ms/step - loss: 0.4323 - acc: 0.8913 - val_loss: 0.4052 - val_acc: 0.9056 Epoch 37/80 40/40 [==============================] - 1s 36ms/step - loss: 0.4221 - acc: 0.8940 - val_loss: 0.4157 - val_acc: 0.9019 Epoch 38/80 40/40 [==============================] - 1s 35ms/step - loss: 0.4355 - acc: 0.8849 - val_loss: 0.4293 - val_acc: 0.8981 Epoch 39/80 40/40 [==============================] - 1s 36ms/step - loss: 0.4121 - acc: 0.8996 - val_loss: 0.4170 - val_acc: 0.8833 Epoch 40/80 40/40 [==============================] - 1s 29ms/step - loss: 0.4352 - acc: 0.8849 - val_loss: 0.4446 - val_acc: 0.8926 Epoch 41/80 40/40 [==============================] - 1s 28ms/step - loss: 0.4214 - acc: 0.8917 - val_loss: 0.3789 - val_acc: 0.9148 Epoch 42/80 40/40 [==============================] - 1s 27ms/step - loss: 0.3923 - acc: 0.9044 - val_loss: 0.4260 - val_acc: 0.8963 Epoch 43/80 40/40 [==============================] - 1s 28ms/step - loss: 0.3932 - acc: 0.9012 - val_loss: 0.3866 - val_acc: 0.9185 Epoch 44/80 40/40 [==============================] - 1s 28ms/step - loss: 0.3751 - acc: 0.9143 - val_loss: 0.4109 - val_acc: 0.8944 Epoch 45/80 40/40 [==============================] - 1s 32ms/step - loss: 0.4170 - acc: 0.8913 - val_loss: 0.3765 - val_acc: 0.9093 Epoch 46/80 40/40 
[==============================] - 1s 36ms/step - loss: 0.4011 - acc: 0.9004 - val_loss: 0.3763 - val_acc: 0.9000 Epoch 47/80 40/40 [==============================] - 1s 30ms/step - loss: 0.3703 - acc: 0.9087 - val_loss: 0.4253 - val_acc: 0.8981 Epoch 48/80 40/40 [==============================] - 1s 31ms/step - loss: 0.3657 - acc: 0.9127 - val_loss: 0.3767 - val_acc: 0.9185 Epoch 49/80 40/40 [==============================] - 2s 43ms/step - loss: 0.3689 - acc: 0.9060 - val_loss: 0.3906 - val_acc: 0.9000 Epoch 50/80 40/40 [==============================] - 2s 43ms/step - loss: 0.3574 - acc: 0.9111 - val_loss: 0.4127 - val_acc: 0.8963 Epoch 51/80 40/40 [==============================] - 2s 40ms/step - loss: 0.3906 - acc: 0.9028 - val_loss: 0.4094 - val_acc: 0.9056 Epoch 52/80 40/40 [==============================] - 1s 35ms/step - loss: 0.3654 - acc: 0.9067 - val_loss: 0.4814 - val_acc: 0.8741 Epoch 53/80 40/40 [==============================] - 2s 39ms/step - loss: 0.3864 - acc: 0.9020 - val_loss: 0.3943 - val_acc: 0.8944 Epoch 54/80 40/40 [==============================] - 1s 33ms/step - loss: 0.3574 - acc: 0.9127 - val_loss: 0.3514 - val_acc: 0.9185 Epoch 55/80 40/40 [==============================] - 1s 32ms/step - loss: 0.3728 - acc: 0.8996 - val_loss: 0.3638 - val_acc: 0.9204 Epoch 56/80 40/40 [==============================] - 2s 39ms/step - loss: 0.3516 - acc: 0.9087 - val_loss: 0.3870 - val_acc: 0.9037 Epoch 57/80 40/40 [==============================] - 1s 33ms/step - loss: 0.3253 - acc: 0.9278 - val_loss: 0.3893 - val_acc: 0.9019 Epoch 58/80 40/40 [==============================] - 1s 37ms/step - loss: 0.3296 - acc: 0.9230 - val_loss: 0.3411 - val_acc: 0.9241 Epoch 59/80 40/40 [==============================] - 1s 34ms/step - loss: 0.3082 - acc: 0.9298 - val_loss: 0.4104 - val_acc: 0.9019 Epoch 60/80 40/40 [==============================] - 2s 40ms/step - loss: 0.3181 - acc: 0.9210 - val_loss: 0.4502 - val_acc: 0.8981 Epoch 61/80 40/40 
[==============================] - 2s 40ms/step - loss: 0.3236 - acc: 0.9254 - val_loss: 0.4177 - val_acc: 0.8815 Epoch 62/80 40/40 [==============================] - 1s 35ms/step - loss: 0.3106 - acc: 0.9290 - val_loss: 0.5222 - val_acc: 0.8574 Epoch 63/80 40/40 [==============================] - 1s 32ms/step - loss: 0.3364 - acc: 0.9179 - val_loss: 0.4092 - val_acc: 0.9093 Epoch 64/80 40/40 [==============================] - 1s 34ms/step - loss: 0.3057 - acc: 0.9210 - val_loss: 0.3616 - val_acc: 0.9093 Epoch 65/80 40/40 [==============================] - 1s 31ms/step - loss: 0.3075 - acc: 0.9262 - val_loss: 0.3477 - val_acc: 0.9222 Epoch 66/80 40/40 [==============================] - 1s 31ms/step - loss: 0.3261 - acc: 0.9187 - val_loss: 0.3971 - val_acc: 0.9148 Epoch 67/80 40/40 [==============================] - 1s 30ms/step - loss: 0.3383 - acc: 0.9179 - val_loss: 0.4693 - val_acc: 0.8889 Epoch 68/80 40/40 [==============================] - 1s 32ms/step - loss: 0.3182 - acc: 0.9234 - val_loss: 0.4522 - val_acc: 0.8796 Epoch 69/80 40/40 [==============================] - 1s 33ms/step - loss: 0.3045 - acc: 0.9254 - val_loss: 0.3965 - val_acc: 0.9037 Epoch 70/80 40/40 [==============================] - 1s 34ms/step - loss: 0.3145 - acc: 0.9266 - val_loss: 0.3778 - val_acc: 0.9167 Epoch 71/80 40/40 [==============================] - 1s 35ms/step - loss: 0.3233 - acc: 0.9115 - val_loss: 0.4034 - val_acc: 0.8870 Epoch 72/80 40/40 [==============================] - 1s 33ms/step - loss: 0.3147 - acc: 0.9262 - val_loss: 0.4393 - val_acc: 0.8981 Epoch 73/80 40/40 [==============================] - 1s 34ms/step - loss: 0.3317 - acc: 0.9171 - val_loss: 0.3968 - val_acc: 0.8963 Epoch 74/80 40/40 [==============================] - 1s 33ms/step - loss: 0.3148 - acc: 0.9266 - val_loss: 0.3327 - val_acc: 0.9167 Epoch 75/80 40/40 [==============================] - 1s 31ms/step - loss: 0.3178 - acc: 0.9210 - val_loss: 0.4309 - val_acc: 0.8944 Epoch 76/80 40/40 
[==============================] - 1s 31ms/step - loss: 0.3440 - acc: 0.9075 - val_loss: 0.3428 - val_acc: 0.9185 Epoch 77/80 40/40 [==============================] - 1s 30ms/step - loss: 0.2915 - acc: 0.9306 - val_loss: 0.3187 - val_acc: 0.9241 Epoch 78/80 40/40 [==============================] - 1s 31ms/step - loss: 0.2900 - acc: 0.9298 - val_loss: 0.3067 - val_acc: 0.9296 Epoch 79/80 40/40 [==============================] - 1s 33ms/step - loss: 0.2918 - acc: 0.9274 - val_loss: 0.3872 - val_acc: 0.9019 Epoch 80/80 40/40 [==============================] - 1s 34ms/step - loss: 0.3072 - acc: 0.9246 - val_loss: 0.3129 - val_acc: 0.9222 Saved baseline model to: /tmp/tmplgxqpbng.h5
# Plot training/validation accuracy and loss curves for the spectrogram CNN.
train_acc4 = history4.history['acc']
train_loss4 = history4.history['loss']
val_acc4 = history4.history['val_acc']
val_loss4 = history4.history['val_loss']
epochs4 = range(1, len(train_acc4) + 1)
# Use only the `color` keyword: adding a fmt string ('b'/'r') redundantly
# sets the colour too and raises a UserWarning; the keyword takes precedence,
# so dropping the fmt strings leaves the plots identical.
plt.plot(epochs4, train_acc4, label='Training accuracy', color='darkmagenta')
plt.plot(epochs4, val_acc4, label='Validation accuracy', color='tab:orange')
plt.title('Training and validation accuracy CNN spectrogram')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
# Save before show() so the rendered figure is written, not a blank one.
plt.savefig('/content/gdrive/MyDrive/ESC-10 material/acc_function_CNN_spec.png')
plt.show()
plt.plot(epochs4, train_loss4, label='Training loss', color='darkmagenta')
plt.plot(epochs4, val_loss4, label='Validation loss', color='tab:orange')
plt.title('Training and validation loss CNN spectrogram')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.savefig('/content/gdrive/MyDrive/ESC-10 material/loss_function_CNN_spec.png')
plt.show()
<ipython-input-23-563af9654ab3>:8: UserWarning: color is redundantly defined by the 'color' keyword argument and the fmt string "b" (-> color=(0.0, 0.0, 1.0, 1)). The keyword argument will take precedence. plt.plot(epochs4, train_acc4, 'b', label='Training accuracy', color='darkmagenta') <ipython-input-23-563af9654ab3>:9: UserWarning: color is redundantly defined by the 'color' keyword argument and the fmt string "r" (-> color=(1.0, 0.0, 0.0, 1)). The keyword argument will take precedence. plt.plot(epochs4, val_acc4, 'r', label='Validation accuracy', color='tab:orange')
<ipython-input-23-563af9654ab3>:17: UserWarning: color is redundantly defined by the 'color' keyword argument and the fmt string "b" (-> color=(0.0, 0.0, 1.0, 1)). The keyword argument will take precedence. plt.plot(epochs4, train_loss4, 'b', label='Training loss', color='darkmagenta') <ipython-input-23-563af9654ab3>:18: UserWarning: color is redundantly defined by the 'color' keyword argument and the fmt string "r" (-> color=(1.0, 0.0, 0.0, 1)). The keyword argument will take precedence. plt.plot(epochs4, val_loss4, 'r', label='Validation loss', color='tab:orange')
# Predict classes for the test set with the CNN-spectrogram model (model4).
y_pred4 = model4.predict(X_test_spec)
# Collapse the softmax probabilities to hard class indices so they are
# comparable with the integer labels in y_test.
y_pred4 = np.argmax(y_pred4, axis=1)
# Indices of test samples the model misclassified.
misclassified_indices = np.where(y_pred4 != y_test)[0]
print(misclassified_indices)
# Evaluate accuracy, inference time and gzipped on-disk size.
_, accuracy4 = model4.evaluate(X_test_spec, y_cat_test)
initial_inference_time_4 = measure_inference_time(model4, X_test_spec)
print('Accuracy: %.2f' % (accuracy4*100))
print("Time of inference:", round(initial_inference_time_4, 2))
# Fixed label: this block measures the CNN-with-spectrogram model, not the
# RNN-with-MFCC one (copy-paste leftover).
print("Size of gzipped CNN with spectrogram model: %.2f KB" % (get_gzipped_model_size(keras_file4)/ 1024))
17/17 [==============================] - 0s 9ms/step [ 2 13 19 35 38 74 118 165 167 188 210 219 224 241 262 274 292 302 350 366 372 388 398 402 413 415 427 436 447 455 463 465 479 488 504 508] 17/17 [==============================] - 0s 10ms/step - loss: 0.2924 - acc: 0.9333 17/17 [==============================] - 0s 10ms/step Accuracy: 93.33 Time of inference: 0.85 Size of gzipped RNN with MFCC model: 957.85 KB
# Magnitude pruning for the trained CNN-spectrogram model: sparsity ramps
# polynomially from 30% to 60% over the scheduled steps.
# NOTE(review): end_step assumes batch size 128, but the subsequent fit uses
# batch_size=64, so the schedule finishes roughly halfway through training —
# confirm whether that is intentional.
pruning_params = {
    'pruning_schedule': tfmot.sparsity.keras.PolynomialDecay(
        initial_sparsity=0.3,
        final_sparsity=0.6,
        begin_step=0,
        end_step=int((len(X_train_spec) // 128) * 10)
    )
}
model_pruned = tfmot.sparsity.keras.prune_low_magnitude(model4, **pruning_params)
# UpdatePruningStep must run during training to advance the schedule.
# The unused `logdir = tempfile.mkdtemp()` was removed: no PruningSummaries
# callback ever consumed it.
callbacks = [tfmot.sparsity.keras.UpdatePruningStep()]
model_pruned.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model_pruned.summary()
Model: "sequential_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
prune_low_magnitude_conv2d_ (None, 111, 111, 64) 6210
3 (PruneLowMagnitude)
prune_low_magnitude_max_poo (None, 55, 55, 64) 1
ling2d_3 (PruneLowMagnitude
)
prune_low_magnitude_conv2d_ (None, 27, 27, 128) 147586
4 (PruneLowMagnitude)
prune_low_magnitude_max_poo (None, 13, 13, 128) 1
ling2d_4 (PruneLowMagnitude
)
prune_low_magnitude_conv2d_ (None, 12, 12, 64) 65602
5 (PruneLowMagnitude)
prune_low_magnitude_max_poo (None, 6, 6, 64) 1
ling2d_5 (PruneLowMagnitude
)
prune_low_magnitude_flatten (None, 2304) 1
_1 (PruneLowMagnitude)
prune_low_magnitude_dense_2 (None, 64) 294978
(PruneLowMagnitude)
prune_low_magnitude_dropout (None, 64) 1
_1 (PruneLowMagnitude)
prune_low_magnitude_dense_3 (None, 10) 1292
(PruneLowMagnitude)
=================================================================
Total params: 515,673
Trainable params: 257,994
Non-trainable params: 257,679
_________________________________________________________________
# Fine-tune the pruned model for 10 epochs; the UpdatePruningStep callback
# advances the sparsity schedule at every batch.
model_pruned.fit(X_train_spec, y_cat_train, epochs=10, batch_size=64, validation_data=(X_val_spec, y_cat_val), callbacks=callbacks)
Epoch 1/10 40/40 [==============================] - 2s 57ms/step - loss: 0.2828 - accuracy: 0.9333 - val_loss: 0.3458 - val_accuracy: 0.9296 Epoch 2/10 40/40 [==============================] - 2s 39ms/step - loss: 0.2555 - accuracy: 0.9448 - val_loss: 0.3954 - val_accuracy: 0.9056 Epoch 3/10 40/40 [==============================] - 1s 31ms/step - loss: 0.3913 - accuracy: 0.8857 - val_loss: 0.3596 - val_accuracy: 0.9056 Epoch 4/10 40/40 [==============================] - 1s 30ms/step - loss: 0.3780 - accuracy: 0.9000 - val_loss: 0.3554 - val_accuracy: 0.9204 Epoch 5/10 40/40 [==============================] - 1s 30ms/step - loss: 0.3357 - accuracy: 0.9071 - val_loss: 0.3168 - val_accuracy: 0.9241 Epoch 6/10 40/40 [==============================] - 1s 30ms/step - loss: 0.3211 - accuracy: 0.9163 - val_loss: 0.3263 - val_accuracy: 0.9074 Epoch 7/10 40/40 [==============================] - 1s 30ms/step - loss: 0.2991 - accuracy: 0.9183 - val_loss: 0.3106 - val_accuracy: 0.9204 Epoch 8/10 40/40 [==============================] - 1s 30ms/step - loss: 0.2918 - accuracy: 0.9278 - val_loss: 0.2963 - val_accuracy: 0.9278 Epoch 9/10 40/40 [==============================] - 1s 31ms/step - loss: 0.2944 - accuracy: 0.9198 - val_loss: 0.3147 - val_accuracy: 0.9204 Epoch 10/10 40/40 [==============================] - 1s 32ms/step - loss: 0.3023 - accuracy: 0.9190 - val_loss: 0.3197 - val_accuracy: 0.9222
<keras.callbacks.History at 0x7efe50727310>
# Write the pruned network to a throwaway .h5 file; optimizer state is
# dropped so the later size comparison reflects weights only.
fd_prun, keras_file_prun = tempfile.mkstemp('.h5')
model_pruned.save(keras_file_prun, include_optimizer=False)
print('Saved baseline model to:', keras_file_prun)
Saved baseline model to: /tmp/tmp1t721l0i.h5
# Evaluate the pruned model: accuracy, inference time and gzipped size.
_, accuracy4_prun = model_pruned.evaluate(X_test_spec, y_cat_test)
initial_inference_time_prun = measure_inference_time(model_pruned, X_test_spec)
print('Accuracy: %.2f' % (accuracy4_prun*100))
print("Time of inference:", round(initial_inference_time_prun, 2))
# Fixed label: this is the pruned CNN-with-spectrogram model, not an RNN.
print("Size of gzipped pruned CNN with spectrogram model: %.2f KB" % (get_gzipped_model_size(keras_file_prun)/ 1024))
17/17 [==============================] - 0s 11ms/step - loss: 0.2924 - accuracy: 0.9333 17/17 [==============================] - 0s 9ms/step Accuracy: 93.33 Time of inference: 0.72 Size of gzipped RNN with MFCC model: 589.68 KB
Check the accuracy of the pruned model on the test set and list all of its misclassified samples.
# Hard class predictions from the pruned model, followed by the indices of
# every test sample it gets wrong.
pruned_probs = model_pruned.predict(X_test_spec)
y_pred4 = pruned_probs.argmax(axis=1)
# flatnonzero(cond) is equivalent to where(cond)[0] on a 1-D mask.
misclassified_indices = np.flatnonzero(y_pred4 != y_test)
print(misclassified_indices)
17/17 [==============================] - 0s 10ms/step [ 2 13 19 35 38 74 118 165 167 188 210 219 224 241 262 274 292 302 350 366 372 388 398 402 413 415 427 436 447 455 463 465 479 488 504 508]
# Per-class precision/recall/F1 of the pruned model on the test set,
# using the class names from the label mapping as row labels.
print(classification_report(y_test, y_pred4, target_names=mapping.keys()))
precision recall f1-score support
chainsaw 0.87 1.00 0.93 54
clock_tick 0.91 0.93 0.92 54
crackling_fire 0.89 0.91 0.90 54
crying_baby 0.91 0.98 0.95 54
dog 0.98 0.96 0.97 54
helicopter 0.86 0.93 0.89 54
rain 1.00 0.78 0.88 54
rooster 0.98 0.96 0.97 54
sea_waves 0.96 0.93 0.94 54
sneezing 1.00 0.96 0.98 54
accuracy 0.93 540
macro avg 0.94 0.93 0.93 540
weighted avg 0.94 0.93 0.93 540
# Confusion matrix for the pruned CNN-spectrogram model, plotted and saved.
r_cnn_spec = confusion_matrix(y_test, y_pred4)
disp = ConfusionMatrixDisplay(confusion_matrix=r_cnn_spec, display_labels=mapping)
fig, ax = plt.subplots(figsize=(6,6))
disp.plot(ax=ax)
ax.set_xticklabels(mapping.keys(), rotation='vertical')
# Save BEFORE plt.show(): show() flushes the current figure, so saving
# afterwards wrote an empty canvas (the "<Figure ... with 0 Axes>" output).
# Using fig.savefig also targets this figure explicitly.
fig.savefig('/content/gdrive/MyDrive/ESC-10 material/cnn_spec_prun_conf_mat.png')
plt.show()
# Persist the raw matrix for the cross-model comparison later on.
np.save('/content/gdrive/MyDrive/ESC-10 material/cm_cnn_spec_prun.npy', r_cnn_spec)
<Figure size 640x480 with 0 Axes>
At this point in the modeling process it is important to compare our model with an already existing one, both from an architecture and a performance point of view.
The reference paper chosen is:
"Environmental sound classification using a regularized deep convolutional neural network with data augmentation" (Mushtaq, Su)
The paper was chosen because it provides a clear exposition of the model and the results on the dataset we have chosen (ESC-10).
Below, the visual structure of the network.
This approach is directly comparable with our previous MFCC model.
# Reference CNN from Mushtaq & Su, rebuilt layer by layer on the MFCC input.
# Architecture: three conv stages (the first two L2-regularized), then a
# small dense head with dropout and a 10-way softmax.
model = Sequential()
model.add(layers.Conv2D(24, (5, 5), activation='relu', padding='same',
                        input_shape=X_train.shape[1:],
                        kernel_regularizer=rg.l2(0.001)))
model.add(layers.MaxPooling2D(pool_size=(3, 3), strides=(3, 3), padding='same'))
model.add(layers.Conv2D(36, (4, 4), activation='relu',
                        kernel_regularizer=rg.l2(0.001)))
model.add(layers.MaxPooling2D(2, padding='same'))
model.add(layers.Conv2D(48, (3, 3), activation='relu'))
model.add(layers.Flatten())
model.add(layers.Dense(60, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(10, activation='softmax'))
# Same optimizer/loss/metric configuration as the other models in this study.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
Model: "sequential_8"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d_21 (Conv2D) (None, 60, 216, 24) 1824
max_pooling2d_18 (MaxPoolin (None, 20, 72, 24) 0
g2D)
conv2d_22 (Conv2D) (None, 17, 69, 36) 13860
max_pooling2d_19 (MaxPoolin (None, 9, 35, 36) 0
g2D)
conv2d_23 (Conv2D) (None, 7, 33, 48) 15600
flatten_7 (Flatten) (None, 11088) 0
dense_14 (Dense) (None, 60) 665340
dropout_23 (Dropout) (None, 60) 0
dense_15 (Dense) (None, 10) 610
=================================================================
Total params: 697,234
Trainable params: 697,234
Non-trainable params: 0
_________________________________________________________________
# Train the paper model for 100 epochs with no inter-epoch reshuffling,
# score it on the validation split, then persist two copies: one on Drive
# for reuse and one optimizer-free temp copy for the size measurement.
history5 = model.fit(
    X_train, y_cat_train,
    batch_size=32, epochs=100,
    validation_data=(X_val, y_cat_val),
    shuffle=False,
)
score5 = model.evaluate(X_val, y_cat_val, verbose=0)
model.save('/content/gdrive/MyDrive/models/cnn_mfcc.h5')
fd_tmp, keras_file = tempfile.mkstemp('.h5')
model.save(keras_file, include_optimizer=False)
print('Saved baseline model to:', keras_file)
Epoch 1/100 79/79 [==============================] - 4s 15ms/step - loss: 2.0083 - accuracy: 0.3282 - val_loss: 1.4398 - val_accuracy: 0.4685 Epoch 2/100 79/79 [==============================] - 1s 9ms/step - loss: 1.3320 - accuracy: 0.5313 - val_loss: 0.9852 - val_accuracy: 0.7130 Epoch 3/100 79/79 [==============================] - 1s 8ms/step - loss: 1.0714 - accuracy: 0.6409 - val_loss: 0.7107 - val_accuracy: 0.8000 Epoch 4/100 79/79 [==============================] - 1s 8ms/step - loss: 0.9499 - accuracy: 0.6857 - val_loss: 0.6186 - val_accuracy: 0.8296 Epoch 5/100 79/79 [==============================] - 1s 8ms/step - loss: 0.7898 - accuracy: 0.7405 - val_loss: 0.4868 - val_accuracy: 0.8722 Epoch 6/100 79/79 [==============================] - 1s 8ms/step - loss: 0.6877 - accuracy: 0.7706 - val_loss: 0.5487 - val_accuracy: 0.8370 Epoch 7/100 79/79 [==============================] - 1s 8ms/step - loss: 0.6367 - accuracy: 0.7841 - val_loss: 0.4229 - val_accuracy: 0.8741 Epoch 8/100 79/79 [==============================] - 1s 8ms/step - loss: 0.5367 - accuracy: 0.8190 - val_loss: 0.2886 - val_accuracy: 0.9222 Epoch 9/100 79/79 [==============================] - 1s 8ms/step - loss: 0.5358 - accuracy: 0.8246 - val_loss: 0.2867 - val_accuracy: 0.9130 Epoch 10/100 79/79 [==============================] - 1s 9ms/step - loss: 0.4512 - accuracy: 0.8520 - val_loss: 0.3001 - val_accuracy: 0.9074 Epoch 11/100 79/79 [==============================] - 1s 7ms/step - loss: 0.4742 - accuracy: 0.8516 - val_loss: 0.2915 - val_accuracy: 0.9278 Epoch 12/100 79/79 [==============================] - 1s 7ms/step - loss: 0.4228 - accuracy: 0.8655 - val_loss: 0.2321 - val_accuracy: 0.9296 Epoch 13/100 79/79 [==============================] - 1s 7ms/step - loss: 0.3950 - accuracy: 0.8706 - val_loss: 0.2505 - val_accuracy: 0.9241 Epoch 14/100 79/79 [==============================] - 1s 7ms/step - loss: 0.3853 - accuracy: 0.8698 - val_loss: 0.1971 - val_accuracy: 0.9500 Epoch 15/100 79/79 
[==============================] - 1s 7ms/step - loss: 0.3037 - accuracy: 0.9024 - val_loss: 0.2134 - val_accuracy: 0.9296 Epoch 16/100 79/79 [==============================] - 1s 7ms/step - loss: 0.3370 - accuracy: 0.8853 - val_loss: 0.2727 - val_accuracy: 0.9259 Epoch 17/100 79/79 [==============================] - 1s 7ms/step - loss: 0.3518 - accuracy: 0.8885 - val_loss: 0.2365 - val_accuracy: 0.9389 Epoch 18/100 79/79 [==============================] - 1s 7ms/step - loss: 0.2762 - accuracy: 0.9083 - val_loss: 0.2078 - val_accuracy: 0.9500 Epoch 19/100 79/79 [==============================] - 1s 7ms/step - loss: 0.2695 - accuracy: 0.9155 - val_loss: 0.1902 - val_accuracy: 0.9500 Epoch 20/100 79/79 [==============================] - 1s 7ms/step - loss: 0.2620 - accuracy: 0.9135 - val_loss: 0.1829 - val_accuracy: 0.9519 Epoch 21/100 79/79 [==============================] - 1s 7ms/step - loss: 0.2701 - accuracy: 0.9115 - val_loss: 0.2854 - val_accuracy: 0.9333 Epoch 22/100 79/79 [==============================] - 1s 7ms/step - loss: 0.2391 - accuracy: 0.9194 - val_loss: 0.2249 - val_accuracy: 0.9389 Epoch 23/100 79/79 [==============================] - 1s 7ms/step - loss: 0.2270 - accuracy: 0.9242 - val_loss: 0.1605 - val_accuracy: 0.9611 Epoch 24/100 79/79 [==============================] - 1s 7ms/step - loss: 0.2105 - accuracy: 0.9377 - val_loss: 0.1956 - val_accuracy: 0.9574 Epoch 25/100 79/79 [==============================] - 1s 7ms/step - loss: 0.2194 - accuracy: 0.9262 - val_loss: 0.1642 - val_accuracy: 0.9556 Epoch 26/100 79/79 [==============================] - 1s 8ms/step - loss: 0.2250 - accuracy: 0.9306 - val_loss: 0.2999 - val_accuracy: 0.9333 Epoch 27/100 79/79 [==============================] - 1s 8ms/step - loss: 0.2492 - accuracy: 0.9262 - val_loss: 0.2549 - val_accuracy: 0.9574 Epoch 28/100 79/79 [==============================] - 1s 9ms/step - loss: 0.2071 - accuracy: 0.9317 - val_loss: 0.1838 - val_accuracy: 0.9519 Epoch 29/100 79/79 
[==============================] - 1s 8ms/step - loss: 0.2230 - accuracy: 0.9321 - val_loss: 0.2329 - val_accuracy: 0.9537 Epoch 30/100 79/79 [==============================] - 1s 8ms/step - loss: 0.2050 - accuracy: 0.9361 - val_loss: 0.2901 - val_accuracy: 0.9407 Epoch 31/100 79/79 [==============================] - 1s 8ms/step - loss: 0.2007 - accuracy: 0.9353 - val_loss: 0.2174 - val_accuracy: 0.9556 Epoch 32/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1716 - accuracy: 0.9480 - val_loss: 0.1904 - val_accuracy: 0.9593 Epoch 33/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1586 - accuracy: 0.9540 - val_loss: 0.2187 - val_accuracy: 0.9667 Epoch 34/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1490 - accuracy: 0.9611 - val_loss: 0.2142 - val_accuracy: 0.9574 Epoch 35/100 79/79 [==============================] - 1s 7ms/step - loss: 0.1529 - accuracy: 0.9528 - val_loss: 0.2313 - val_accuracy: 0.9630 Epoch 36/100 79/79 [==============================] - 1s 7ms/step - loss: 0.1836 - accuracy: 0.9460 - val_loss: 0.2245 - val_accuracy: 0.9537 Epoch 37/100 79/79 [==============================] - 1s 7ms/step - loss: 0.1700 - accuracy: 0.9516 - val_loss: 0.3193 - val_accuracy: 0.9389 Epoch 38/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1788 - accuracy: 0.9520 - val_loss: 0.2376 - val_accuracy: 0.9481 Epoch 39/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1832 - accuracy: 0.9488 - val_loss: 0.1988 - val_accuracy: 0.9611 Epoch 40/100 79/79 [==============================] - 1s 7ms/step - loss: 0.1598 - accuracy: 0.9536 - val_loss: 0.2704 - val_accuracy: 0.9352 Epoch 41/100 79/79 [==============================] - 1s 7ms/step - loss: 0.1618 - accuracy: 0.9496 - val_loss: 0.2222 - val_accuracy: 0.9574 Epoch 42/100 79/79 [==============================] - 1s 7ms/step - loss: 0.1391 - accuracy: 0.9643 - val_loss: 0.2602 - val_accuracy: 0.9593 Epoch 43/100 79/79 
[==============================] - 1s 7ms/step - loss: 0.1361 - accuracy: 0.9579 - val_loss: 0.2215 - val_accuracy: 0.9667 Epoch 44/100 79/79 [==============================] - 1s 7ms/step - loss: 0.1357 - accuracy: 0.9587 - val_loss: 0.3584 - val_accuracy: 0.9574 Epoch 45/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1647 - accuracy: 0.9520 - val_loss: 0.2481 - val_accuracy: 0.9630 Epoch 46/100 79/79 [==============================] - 1s 7ms/step - loss: 0.2580 - accuracy: 0.9333 - val_loss: 0.4972 - val_accuracy: 0.9019 Epoch 47/100 79/79 [==============================] - 1s 7ms/step - loss: 0.3041 - accuracy: 0.9234 - val_loss: 0.3450 - val_accuracy: 0.9481 Epoch 48/100 79/79 [==============================] - 1s 7ms/step - loss: 0.1903 - accuracy: 0.9488 - val_loss: 0.4909 - val_accuracy: 0.9278 Epoch 49/100 79/79 [==============================] - 1s 8ms/step - loss: 0.2592 - accuracy: 0.9353 - val_loss: 0.2290 - val_accuracy: 0.9519 Epoch 50/100 79/79 [==============================] - 1s 7ms/step - loss: 0.1616 - accuracy: 0.9524 - val_loss: 0.2321 - val_accuracy: 0.9574 Epoch 51/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1215 - accuracy: 0.9671 - val_loss: 0.2755 - val_accuracy: 0.9611 Epoch 52/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1129 - accuracy: 0.9687 - val_loss: 0.2360 - val_accuracy: 0.9556 Epoch 53/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1292 - accuracy: 0.9687 - val_loss: 0.3313 - val_accuracy: 0.9426 Epoch 54/100 79/79 [==============================] - 1s 9ms/step - loss: 0.1205 - accuracy: 0.9718 - val_loss: 0.2689 - val_accuracy: 0.9537 Epoch 55/100 79/79 [==============================] - 1s 9ms/step - loss: 0.1496 - accuracy: 0.9560 - val_loss: 0.2261 - val_accuracy: 0.9574 Epoch 56/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1213 - accuracy: 0.9643 - val_loss: 0.2547 - val_accuracy: 0.9648 Epoch 57/100 79/79 
[==============================] - 1s 8ms/step - loss: 0.1203 - accuracy: 0.9698 - val_loss: 0.2774 - val_accuracy: 0.9611 Epoch 58/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1343 - accuracy: 0.9635 - val_loss: 0.2360 - val_accuracy: 0.9648 Epoch 59/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1185 - accuracy: 0.9655 - val_loss: 0.2751 - val_accuracy: 0.9648 Epoch 60/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1311 - accuracy: 0.9631 - val_loss: 0.3814 - val_accuracy: 0.9407 Epoch 61/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1161 - accuracy: 0.9690 - val_loss: 0.2749 - val_accuracy: 0.9556 Epoch 62/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1356 - accuracy: 0.9651 - val_loss: 0.3021 - val_accuracy: 0.9463 Epoch 63/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1118 - accuracy: 0.9667 - val_loss: 0.2775 - val_accuracy: 0.9500 Epoch 64/100 79/79 [==============================] - 1s 7ms/step - loss: 0.1390 - accuracy: 0.9607 - val_loss: 0.4720 - val_accuracy: 0.9204 Epoch 65/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1561 - accuracy: 0.9587 - val_loss: 0.3540 - val_accuracy: 0.9278 Epoch 66/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1318 - accuracy: 0.9639 - val_loss: 0.2839 - val_accuracy: 0.9574 Epoch 67/100 79/79 [==============================] - 1s 7ms/step - loss: 0.1343 - accuracy: 0.9647 - val_loss: 0.3059 - val_accuracy: 0.9537 Epoch 68/100 79/79 [==============================] - 1s 7ms/step - loss: 0.0976 - accuracy: 0.9762 - val_loss: 0.2967 - val_accuracy: 0.9537 Epoch 69/100 79/79 [==============================] - 1s 7ms/step - loss: 0.1019 - accuracy: 0.9738 - val_loss: 0.2806 - val_accuracy: 0.9667 Epoch 70/100 79/79 [==============================] - 1s 7ms/step - loss: 0.1588 - accuracy: 0.9619 - val_loss: 0.5143 - val_accuracy: 0.9222 Epoch 71/100 79/79 
[==============================] - 1s 7ms/step - loss: 0.1828 - accuracy: 0.9556 - val_loss: 0.3103 - val_accuracy: 0.9389 Epoch 72/100 79/79 [==============================] - 1s 7ms/step - loss: 0.1087 - accuracy: 0.9730 - val_loss: 0.3866 - val_accuracy: 0.9426 Epoch 73/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1186 - accuracy: 0.9734 - val_loss: 0.3017 - val_accuracy: 0.9574 Epoch 74/100 79/79 [==============================] - 1s 8ms/step - loss: 0.0974 - accuracy: 0.9770 - val_loss: 0.3312 - val_accuracy: 0.9574 Epoch 75/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1206 - accuracy: 0.9702 - val_loss: 0.2891 - val_accuracy: 0.9519 Epoch 76/100 79/79 [==============================] - 1s 8ms/step - loss: 0.0943 - accuracy: 0.9746 - val_loss: 0.2659 - val_accuracy: 0.9519 Epoch 77/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1042 - accuracy: 0.9698 - val_loss: 0.2936 - val_accuracy: 0.9500 Epoch 78/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1013 - accuracy: 0.9718 - val_loss: 0.2982 - val_accuracy: 0.9593 Epoch 79/100 79/79 [==============================] - 1s 8ms/step - loss: 0.0840 - accuracy: 0.9778 - val_loss: 0.3018 - val_accuracy: 0.9537 Epoch 80/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1147 - accuracy: 0.9746 - val_loss: 0.2495 - val_accuracy: 0.9519 Epoch 81/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1192 - accuracy: 0.9663 - val_loss: 0.2332 - val_accuracy: 0.9593 Epoch 82/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1374 - accuracy: 0.9667 - val_loss: 0.2407 - val_accuracy: 0.9611 Epoch 83/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1518 - accuracy: 0.9615 - val_loss: 0.3509 - val_accuracy: 0.9574 Epoch 84/100 79/79 [==============================] - 1s 9ms/step - loss: 0.1103 - accuracy: 0.9694 - val_loss: 0.2627 - val_accuracy: 0.9611 Epoch 85/100 79/79 
[==============================] - 1s 9ms/step - loss: 0.1049 - accuracy: 0.9750 - val_loss: 0.3137 - val_accuracy: 0.9593 Epoch 86/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1012 - accuracy: 0.9766 - val_loss: 0.3952 - val_accuracy: 0.9481 Epoch 87/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1136 - accuracy: 0.9714 - val_loss: 0.2883 - val_accuracy: 0.9519 Epoch 88/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1630 - accuracy: 0.9659 - val_loss: 0.2897 - val_accuracy: 0.9444 Epoch 89/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1381 - accuracy: 0.9667 - val_loss: 0.2177 - val_accuracy: 0.9630 Epoch 90/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1352 - accuracy: 0.9683 - val_loss: 0.2819 - val_accuracy: 0.9500 Epoch 91/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1023 - accuracy: 0.9778 - val_loss: 0.2583 - val_accuracy: 0.9685 Epoch 92/100 79/79 [==============================] - 1s 8ms/step - loss: 0.0951 - accuracy: 0.9782 - val_loss: 0.2669 - val_accuracy: 0.9667 Epoch 93/100 79/79 [==============================] - 1s 8ms/step - loss: 0.0972 - accuracy: 0.9742 - val_loss: 0.2175 - val_accuracy: 0.9704 Epoch 94/100 79/79 [==============================] - 1s 8ms/step - loss: 0.0786 - accuracy: 0.9817 - val_loss: 0.2863 - val_accuracy: 0.9685 Epoch 95/100 79/79 [==============================] - 1s 8ms/step - loss: 0.0911 - accuracy: 0.9766 - val_loss: 0.3138 - val_accuracy: 0.9556 Epoch 96/100 79/79 [==============================] - 1s 9ms/step - loss: 0.0771 - accuracy: 0.9821 - val_loss: 0.3020 - val_accuracy: 0.9537 Epoch 97/100 79/79 [==============================] - 1s 8ms/step - loss: 0.0726 - accuracy: 0.9869 - val_loss: 0.2738 - val_accuracy: 0.9667 Epoch 98/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1047 - accuracy: 0.9730 - val_loss: 0.3224 - val_accuracy: 0.9537 Epoch 99/100 79/79 
[==============================] - 1s 8ms/step - loss: 0.1269 - accuracy: 0.9758 - val_loss: 0.4634 - val_accuracy: 0.9519 Epoch 100/100 79/79 [==============================] - 1s 8ms/step - loss: 0.1607 - accuracy: 0.9563 - val_loss: 0.3232 - val_accuracy: 0.9481 Saved baseline model to: /tmp/tmp90fqfgo6.h5
# Pull the training curves out of the paper-model run (history5).
train_acc5 = history5.history['accuracy']
train_loss5 = history5.history['loss']
val_acc5 = history5.history['val_accuracy']
val_loss5 = history5.history['val_loss']
epochs5 = range(1, len(train_acc5) + 1)

# Accuracy curves. The fmt strings 'b'/'r' were removed: they redundantly set
# a color that the `color=` keyword overrides anyway, which raised a
# UserWarning on every call. Rendered colors are unchanged.
plt.plot(epochs5, train_acc5, label='Training accuracy', color='darkmagenta')
plt.plot(epochs5, val_acc5, label='Validation accuracy', color='tab:orange')
plt.title('Training and validation accuracy CNN')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.savefig('/content/gdrive/MyDrive/ESC-10 material/acc_function_paper.png')
plt.show()

# Loss curves for the same run.
plt.plot(epochs5, train_loss5, label='Training loss', color='darkmagenta')
plt.plot(epochs5, val_loss5, label='Validation loss', color='tab:orange')
plt.title('Training and validation loss CNN')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.savefig('/content/gdrive/MyDrive/ESC-10 material/loss_function_paper.png')
plt.show()
<ipython-input-105-7be962f3d892>:8: UserWarning: color is redundantly defined by the 'color' keyword argument and the fmt string "b" (-> color=(0.0, 0.0, 1.0, 1)). The keyword argument will take precedence. plt.plot(epochs5, train_acc5, 'b', label='Training accuracy', color='darkmagenta') <ipython-input-105-7be962f3d892>:9: UserWarning: color is redundantly defined by the 'color' keyword argument and the fmt string "r" (-> color=(1.0, 0.0, 0.0, 1)). The keyword argument will take precedence. plt.plot(epochs5, val_acc5, 'r', label='Validation accuracy', color='tab:orange')
<ipython-input-105-7be962f3d892>:17: UserWarning: color is redundantly defined by the 'color' keyword argument and the fmt string "b" (-> color=(0.0, 0.0, 1.0, 1)). The keyword argument will take precedence. plt.plot(epochs5, train_loss5, 'b', label='Training loss', color='darkmagenta') <ipython-input-105-7be962f3d892>:18: UserWarning: color is redundantly defined by the 'color' keyword argument and the fmt string "r" (-> color=(1.0, 0.0, 0.0, 1)). The keyword argument will take precedence. plt.plot(epochs5, val_loss5, 'r', label='Validation loss', color='tab:orange')
# Predict classes for the test set with the paper CNN (MFCC input).
y_pred5 = model.predict(X_test)
# Collapse the softmax probabilities to hard class indices matching y_test.
y_pred5 = np.argmax(y_pred5, axis=1)
# Indices of test samples the model misclassified.
misclassified_indices = np.where(y_pred5 != y_test)[0]
print(misclassified_indices)
# Evaluate accuracy, inference time and gzipped on-disk size.
_, accuracy5 = model.evaluate(X_test, y_cat_test)
initial_inference_time_5 = measure_inference_time(model, X_test)
print('Accuracy: %.2f' % (accuracy5*100))
print("Time of inference:", round(initial_inference_time_5, 2))
# Fixed label: this measures the paper CNN with MFCC, not an RNN.
print("Size of gzipped paper CNN with MFCC model: %.2f KB" % (get_gzipped_model_size(keras_file)/ 1024))
17/17 [==============================] - 0s 4ms/step [ 6 59 74 84 104 116 137 150 153 165 167 181 204 221 224 232 233 236 241 244 259 272 274 292 342 346 369 392 397 415 427 446 447 455 461 465 524 530] 17/17 [==============================] - 0s 5ms/step - loss: 0.3820 - accuracy: 0.9296 17/17 [==============================] - 0s 3ms/step Accuracy: 92.96 Time of inference: 0.36 Size of gzipped RNN with MFCC model: 2545.88 KB
# Per-class precision/recall/F1 of the paper model on the test set,
# using the class names from the label mapping as row labels.
print(classification_report(y_test, y_pred5, target_names=mapping.keys()))
precision recall f1-score support
chainsaw 0.95 1.00 0.97 54
clock_tick 0.84 1.00 0.92 54
crackling_fire 1.00 0.78 0.88 54
crying_baby 0.98 0.96 0.97 54
dog 0.94 0.91 0.92 54
helicopter 0.88 0.85 0.87 54
rain 0.83 0.93 0.88 54
rooster 0.96 1.00 0.98 54
sea_waves 1.00 0.94 0.97 54
sneezing 0.94 0.93 0.93 54
accuracy 0.93 540
macro avg 0.93 0.93 0.93 540
weighted avg 0.93 0.93 0.93 540
The paper model is less complex: it has fewer layers. It shows a higher level of overfitting than our model and a shorter execution time, and the accuracy it achieves is slightly lower than that of our model.
def _per_class_acc(cm):
    # Per-class accuracy: correct counts on the diagonal over true counts
    # per row, rounded to two decimals.
    return np.round(np.diag(cm) / cm.sum(axis=1), 2)

# Reload the confusion matrices saved earlier for all four models.
r_cnn = np.load('/content/gdrive/MyDrive/ESC-10 material/cm_cnn_mfcc.npy')
r_rnn = np.load('/content/gdrive/MyDrive/ESC-10 material/cm_rnn_mfcc.npy')
r_crnn = np.load('/content/gdrive/MyDrive/ESC-10 material/cm_crnn_mfcc.npy')
r_cnn_spec = np.load('/content/gdrive/MyDrive/ESC-10 material/cm_cnn_spec_prun.npy')

class_acc_mfcc_cnn = _per_class_acc(r_cnn)
class_acc_mfcc_rnn = _per_class_acc(r_rnn)
class_acc_mfcc_crnn = _per_class_acc(r_crnn)
class_acc_spec_cnn = _per_class_acc(r_cnn_spec)

# One row per model, one column per sound class.
acc_list = [class_acc_mfcc_cnn, class_acc_mfcc_rnn, class_acc_mfcc_crnn, class_acc_spec_cnn]
indexes = ['CNN_mfcc', 'RNN_mfcc', 'CRNN_mfcc', 'CNN_spec']
acc_df = pd.DataFrame(acc_list, columns=mapping.keys(), index=indexes)
acc_df
| chainsaw | clock_tick | crackling_fire | crying_baby | dog | helicopter | rain | rooster | sea_waves | sneezing | |
|---|---|---|---|---|---|---|---|---|---|---|
| CNN_mfcc | 0.96 | 0.91 | 0.91 | 1.00 | 0.93 | 0.94 | 0.80 | 1.00 | 0.98 | 0.98 |
| RNN_mfcc | 0.98 | 0.94 | 0.89 | 1.00 | 0.96 | 0.98 | 0.93 | 1.00 | 0.96 | 0.94 |
| CRNN_mfcc | 0.98 | 0.94 | 0.91 | 0.98 | 0.89 | 0.94 | 0.89 | 1.00 | 0.96 | 0.98 |
| CNN_spec | 1.00 | 0.93 | 0.91 | 0.98 | 0.96 | 0.93 | 0.78 | 0.96 | 0.93 | 0.96 |
# Grouped bar chart comparing per-class accuracy of the four models.
x = np.arange(10)
class_acc_mfcc_cnn_f = list(np.around(np.array(class_acc_mfcc_cnn),2))
class_acc_mfcc_rnn_f = list(np.around(np.array(class_acc_mfcc_rnn),2))
class_acc_mfcc_crnn_f = list(np.around(np.array(class_acc_mfcc_crnn),2))
class_acc_spec_cnn_f = list(np.around(np.array(class_acc_spec_cnn), 2))
categories_f_plot = list(mapping.keys())
width = 0.15
plt.bar(x - 0.3, class_acc_mfcc_cnn_f, width, color="gold")
plt.bar(x - 0.15, class_acc_mfcc_rnn_f, width, color="limegreen")
plt.bar(x, class_acc_mfcc_crnn_f, width, color="steelblue")
plt.bar(x + 0.15, class_acc_spec_cnn_f, width, color="indigo")
plt.xticks(x, categories_f_plot, rotation=90)
plt.xlabel("Sounds")
plt.ylabel("Accuracies")
plt.legend(["CNN mfcc", "RNN mfcc", "CRNN mfcc", "CNN spectrogram"], loc='center left', bbox_to_anchor=(1, 0.5))
# Save BEFORE plt.show(): show() flushes the current figure, so the old
# show-then-savefig order wrote an empty image (the "<Figure ... with
# 0 Axes>" output). bbox_inches='tight' keeps the legend, which sits
# outside the axes, inside the saved file.
plt.savefig('/content/gdrive/MyDrive/ESC-10 material/comparison_hist.png', bbox_inches='tight')
plt.show()
<Figure size 640x480 with 0 Axes>